- Source: Florida Department of Corrections — “Inmate Mortality — Statistics and Publications” https://www.fdc.myflorida.com/statistics-and-publications/inmate-mortality
- Retrieved: 2025-08-10 14:07:00 (UTC)
- Why snapshot: Preserves exactly what FDOC published on the retrieval date for reproducibility.
- Method: Read the first HTML table from the saved snapshot and export to CSV. No manual edits here; any cleaning happens later.
from io import StringIO
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
SNAPSHOT_PATH = Path("fdoc_deaths/data/Inmate Mortality _ Statistics and Publications - Florida Department of Corrections.html")
OUT_CSV = Path("fdoc_deaths/data/FDOC's Mortality Table.csv")

# Parse the saved snapshot once; all table extraction works off this soup.
with SNAPSHOT_PATH.open(encoding="utf-8") as snapshot_file:
    soup = BeautifulSoup(snapshot_file, "html.parser")

# Prefer the first <table> element; if none is found, let pandas scan the whole document.
table = soup.find("table")
if table is None:
    candidate_tables = pd.read_html(StringIO(str(soup)))
    if not candidate_tables:
        raise ValueError(f"No <table> elements found in snapshot: {SNAPSHOT_PATH}")
    df = candidate_tables[0]
else:
    df = pd.read_html(StringIO(str(table)))[0]

# Write the raw table out; any cleaning happens downstream.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=False)
print(f"Saved CSV to {OUT_CSV} with shape {df.shape}")
Saved CSV to fdoc_deaths/data/FDOC's Mortality Table.csv with shape (6, 8)
from IPython.display import Image, display, Markdown
IMG_PATH = "assets/FDOH Screenshot 2025-08-05 at 9.53.12 AM.png"

# Optional metadata used only in the caption below.
FDOH_URL = "https://www.flhealthcharts.gov/FLQUERY_New/Death/Rate"
QUERY_TEXT = "Crude Death Rate by Year, Homicide by Firearms Discharge, Homicide by Other and Unspecified Means"
RETRIEVED = "2025-08-05 at 13.53.12 UTC"

# Render the screenshot; embed=True keeps the image inside exported HTML.
display(Image(filename=IMG_PATH, embed=True, width=1000))

# Caption rendered underneath the image.
caption = (
    f"Source: [FDOH data portal]({FDOH_URL}) \n"
    f"Query: {QUERY_TEXT} \n"
    f"Retrieved: {RETRIEVED}"
)
display(Markdown(caption))
Source: FDOH data portal
Query: Crude Death Rate by Year, Homicide by Firearms Discharge, Homicide by Other and Unspecified Means
Retrieved: 2025-08-05 at 13.53.12 UTC
df_m = pd.read_csv("fdoc_deaths/data/FDOC's Mortality Table.csv")
#print(df_m)

# Drop rows with no recorded population (e.g. a not-yet-populated fiscal year).
df_m = df_m[df_m['Inmate Population on June 30th'] != 0]

# Florida yearly homicide rates transcribed from the FDOH screenshot above.
# NOTE(review): assumes df_m has exactly 5 rows in fiscal-year order after the
# filter above — verify if the source table ever changes shape.
fl_rates = [6.3, 7.0, 6.6, 6.6, 5.9]

# In-prison homicide rate per 100,000 inmates (June 30 population denominator).
df_m['Homicide Rate'] = (df_m['Homicide'] / df_m['Inmate Population on June 30th'] * 100000).round(1)
df_m['FL Statewide Homicide Rate'] = fl_rates
df_m.drop(['Accident', 'Suicide', 'Pending', "Natural*", "Total"], axis=1, inplace=True)

mean_fd_homicide = df_m['Homicide Rate'].mean().round(1)
mean_fl_homicide = df_m['FL Statewide Homicide Rate'].mean().round(1)

# Build a summary row (blank cells where a mean makes no sense).
summary_row = {
    'Fiscal Year': 'MEAN',
    'Homicide': '',
    'Inmate Population on June 30th': '',
    'Homicide Rate': mean_fd_homicide,
    'FL Statewide Homicide Rate': mean_fl_homicide
}
# Append the row
df_m = pd.concat([df_m, pd.DataFrame([summary_row])], ignore_index=True)

table_html = df_m.to_html(index=False, classes='striped-table')

# Wrap it in a full HTML5 doc
full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>FDOC Mortality Table (2019–2024)</title>
<link rel="stylesheet" href="../style.css">
</head>
<body>
<h2>FDOC Annual Mortality Table (2019–2024)</h2>
{table_html}
</body>
</html>
"""

# BUGFIX: write UTF-8 explicitly so the bytes match the declared
# <meta charset="UTF-8"> (the document contains en-dashes) regardless of
# the platform's default locale encoding.
with open("fdoc_deaths/FDOC_mortality_table.html", "w", encoding="utf-8") as f:
    f.write(full_html)
“According to available data, globally there were 34.2 deaths by suicide of persons deprived of liberty per 100,000, which is higher than the suicide rate among the general population, which is 9.2 deaths per 100,000 inhabitants. On the other hand, the intentional homicide rate inside prisons is 12.2 per 100,000 persons deprived of liberty, while among the general population the rate is 5.8 intentional homicide deaths per 100,000 inhabitants. In other words, in most of the countries with available data, the rate of intentional homicide is higher inside prisons than among the general population, and the Americas is the region with the highest rate of intentional homicide deaths inside prisons, with 18.3 victims per 100,000 persons deprived of liberty.”
— UNODC (2024). Global prison population and trends: a focus on rehabilitation.
Source: https://www.cdeunodc.inegi.org.mx/unodc/index.php/2024/08/15/global-prison-population-and-trends-a-focus-on-rehabilitation/index.htm Accessed: 2025-08-10 17:34:00 UTC
Note: 12.2 per 100,000 = 0.122 per 1,000.
import altair as alt

# Comparison of homicide rates. All three values are per 100,000:
# FDOC and FL rates are computed/transcribed above (x 100,000), and the
# UNODC global prison figure is quoted as 12.2 per 100,000.
data = pd.DataFrame({
    'GroupShort': [
        'FDOC Prison Homicide',
        'Global Prison Homicide',
        'FL State Homicide'
    ],
    'GroupFull': [
        'FDOC In-Prison Homicide (2019–24)',
        'Global Prison Homicide (UNODC, 2024)',
        'Florida Statewide Homicide (2019–24)'
    ],
    'Rate': [20.35, 12.2, 6.48]
})

highlight = ['FDOC Prison Homicide']

chart = alt.Chart(data).mark_bar(
    cornerRadiusEnd=8,
    size=35
).encode(
    x=alt.X(
        'Rate:Q',
        # BUGFIX: these rates are per 100,000, not per 1,000,000
        # (see the rate computation and UNODC note above).
        title='Deaths per 100,000',
        axis=alt.Axis(
            labelFontSize=13,
            titleFontSize=13,
            labelColor='white',
            titleColor='white'
        ),
        scale=alt.Scale(domain=[0, 22])
    ),
    y=alt.Y(
        'GroupShort:N',
        sort='-x',
        axis=alt.Axis(
            title=None,
            labelFontSize=13,
            labelAlign='right',
            labelPadding=12,
            labelColor='white'
        )
    ),
    color=alt.condition(
        alt.FieldOneOfPredicate(field="GroupShort", oneOf=highlight),
        alt.value('#e8596a'),
        alt.value('#166a6a')
    ),
    tooltip=[
        alt.Tooltip('GroupFull:N', title='Group'),
        alt.Tooltip('Rate:Q', title='Deaths per 100,000', format='.1f')
    ]
).properties(
    width=500, height=180
).configure_view(
    stroke=None,
    fill=None
).configure_axis(
    grid=True
).configure(
    background='rgba(0,0,0,0)'
)

# NOTE: filename kept unchanged (despite the misleading "1000000") so pages
# already embedding this asset keep working.
chart.save("assets/deaths_per_1000000.json")
chart.display()
#bringing in the raw OBIS data, pulled 2025-08-11 from https://www.fdc.myflorida.com/statistics-and-publications/public-records-requests-for-the-obis-database
import os
# Show full cell contents and all rows when printing frames below (debug-friendly).
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
def convert_xlsx_sheets_to_csvs(
    xlsx_path,
    output_dir=None,
    prefix=None
):
    """Convert every sheet in an Excel workbook to its own CSV file.

    Parameters
    ----------
    xlsx_path : str
        Path to the .xlsx/.XLSX workbook.
    output_dir : str, optional
        Directory for the CSVs; defaults to the workbook's own directory.
    prefix : str, optional
        Output filename prefix; defaults to the workbook basename (no extension).

    Returns
    -------
    dict
        Mapping of sheet name -> written CSV path (also printed per sheet).
    """
    # Prepare output directory.
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(xlsx_path))
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    if prefix is None:
        prefix = os.path.splitext(os.path.basename(xlsx_path))[0]
    # Load Excel file (all sheets).
    xl = pd.ExcelFile(xlsx_path)
    mapping = {}
    for sheet in xl.sheet_names:
        # Replace filesystem-unfriendly characters in the sheet name.
        safe_sheet = ''.join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in sheet).rstrip()
        outpath = os.path.join(output_dir, f"{prefix}__{safe_sheet}.csv")
        df = xl.parse(sheet)
        df.to_csv(outpath, index=False)
        mapping[sheet] = outpath
        print(f"Sheet '{sheet}' -> '{outpath}'")
    return mapping
# --- Usage ---
for workbook in ('fdoc_deaths/data/Inmate_Active.XLSX', 'fdoc_deaths/data/Inmate_Release.XLSX'):
    mapping = convert_xlsx_sheets_to_csvs(workbook, output_dir='./fdoc_deaths/csvs')
Sheet 'Inmate Active Root' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Root.csv' Sheet 'Inmate Active Aliases' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Aliases.csv' Sheet 'Inmate Active Offenses CPS' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Offenses CPS.csv' Sheet 'Inmate Active Offenses PRPR' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Offenses PRPR.csv' Sheet 'Inmate Active Detainers' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Detainers.csv' Sheet 'Inmate Active Incarhist' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Incarhist.csv' Sheet 'Inmate Release Root' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Root.csv' Sheet 'Inmate Release Alias_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_1.csv' Sheet 'Inmate Release Alias_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_2.csv' Sheet 'Inmate Release Alias_3' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_3.csv' Sheet 'Inmate Release Offenses CPS_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses CPS_1.csv' Sheet 'Inmate Release Offenses CPS_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses CPS_2.csv' Sheet 'Inmate Release Offenses PRPR_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses PRPR_1.csv' Sheet 'Inmate Release Offenses PRPR_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses PRPR_2.csv' Sheet 'Inmate Release Detainer' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Detainer.csv' Sheet 'Inmate Release Incarhist' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Incarhist.csv'
# To build a population pyramid of the FDOC, we must first determine who was incarcerated when.
df_incarcerated_incarhist = pd.read_csv(
    "fdoc_deaths/csvs/Inmate_Active__Inmate Active Incarhist.csv", dtype={'DCNumber': 'object'}
)
print(df_incarcerated_incarhist.info(), df_incarcerated_incarhist.head())

df_released_incarhist = pd.read_csv(
    "fdoc_deaths/csvs/Inmate_Release__Inmate Release Incarhist.csv", dtype={'DCNumber': 'object'}
)
print(df_released_incarhist.info(), df_released_incarhist.head())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 196525 entries, 0 to 196524 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DCNumber 196525 non-null object 1 ReceiptDate 196525 non-null object 2 ReleaseDate 106400 non-null object dtypes: object(3) memory usage: 4.5+ MB None DCNumber ReceiptDate ReleaseDate 0 000111 2022-08-09 2023-09-18 1 000111 2024-05-16 NaN 2 000132 2018-10-08 NaN 3 000155 2024-05-23 NaN 4 000155 2018-10-24 2019-03-27 <class 'pandas.core.frame.DataFrame'> RangeIndex: 915242 entries, 0 to 915241 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DCNumber 915242 non-null object 1 ReceiptDate 915242 non-null object 2 ReleaseDate 915192 non-null object dtypes: object(3) memory usage: 20.9+ MB None DCNumber ReceiptDate ReleaseDate 0 47880 1954-04-02 1955-02-19 1 47880 1951-02-12 1953-09-28 2 53946 1954-11-27 1955-08-17 3 53946 1965-06-03 1965-06-13 4 56050 1955-11-09 1957-08-13
from datetime import datetime
# Stack active + released incarceration histories into a single frame.
df_all_hist = pd.concat([df_incarcerated_incarhist, df_released_incarhist], ignore_index=True)
for date_col in ('ReceiptDate', 'ReleaseDate'):
    df_all_hist[date_col] = pd.to_datetime(df_all_hist[date_col])

# Open-ended stays run through the extraction date — 'presently incarcerated'.
DATA_PULL_DATE = pd.Timestamp('2025-08-11')
df_all_hist['ReleaseDate'] = df_all_hist['ReleaseDate'].fillna(DATA_PULL_DATE)
print(df_all_hist.info(), df_all_hist.head())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1111767 entries, 0 to 1111766 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DCNumber 1111767 non-null object 1 ReceiptDate 1111767 non-null datetime64[ns] 2 ReleaseDate 1111767 non-null datetime64[ns] dtypes: datetime64[ns](2), object(1) memory usage: 25.4+ MB None DCNumber ReceiptDate ReleaseDate 0 000111 2022-08-09 2023-09-18 1 000111 2024-05-16 2025-08-11 2 000132 2018-10-08 2025-08-11 3 000155 2024-05-23 2025-08-11 4 000155 2018-10-24 2019-03-27
# Order each person's stints chronologically before building intervals.
df_all_hist = df_all_hist.sort_values(['DCNumber', 'ReceiptDate'])
# Group and produce lists of pd.Intervals
def make_intervals(subdf):
    """Return closed pd.Intervals, one per valid (receipt, release) date pair.

    Pairs with a missing endpoint or with receipt after release are skipped.
    """
    return [
        pd.Interval(left=start, right=end, closed='both')
        for start, end in zip(subdf['ReceiptDate'], subdf['ReleaseDate'])
        if pd.notnull(start) and pd.notnull(end) and start <= end
    ]
# One row per person: the chronological list of their incarceration intervals.
sorted_hist = df_all_hist.sort_values(['DCNumber', 'ReceiptDate'])
per_person = sorted_hist.groupby('DCNumber', group_keys=False)[['ReceiptDate', 'ReleaseDate']]
grouped = per_person.apply(make_intervals).reset_index(name='Incar_intervals')
print(grouped.head())
DCNumber \ 0 47880 1 53946 2 56050 3 000001 4 000004 Incar_intervals 0 [[1951-02-12 00:00:00, 1953-09-28 00:00:00], [1954-04-02 00:00:00, 1955-02-19 00:00:00]] 1 [[1954-11-27 00:00:00, 1955-08-17 00:00:00], [1965-06-03 00:00:00, 1965-06-13 00:00:00]] 2 [[1955-11-09 00:00:00, 1957-08-13 00:00:00], [1957-09-03 00:00:00, 1965-11-09 00:00:00], [1969-02-26 00:00:00, 1971-12-28 00:00:00], [1979-02-08 00:00:00, 1980-11-07 00:00:00], [1980-11-10 00:00:00, 1981-07-14 00:00:00]] 3 [[1960-07-01 00:00:00, 2005-04-24 00:00:00]] 4 [[1995-10-25 00:00:00, 1997-12-11 00:00:00]]
# Now that we have intervals of incarceration for everyone ever or currently
# incarcerated, we need to associate that with age/sex data.
df_incarcerated_root = pd.read_csv(
    "fdoc_deaths/csvs/Inmate_Active__Inmate Active Root.csv",
    dtype={'DCNumber': 'object', 'Sex': 'category'},
)
print(df_incarcerated_root.info(), df_incarcerated_root.head())

df_released_root = pd.read_csv(
    "fdoc_deaths/csvs/Inmate_Release__Inmate Release Root.csv",
    dtype={'DCNumber': 'object', 'Sex': 'category'},
)
print(df_released_root.info(), df_released_root.head())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 90169 entries, 0 to 90168 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DCNumber 90169 non-null object 1 LastName 90168 non-null object 2 FirstName 90169 non-null object 3 MiddleName 68553 non-null object 4 NameSuffix 7372 non-null object 5 Race 90169 non-null object 6 Sex 90169 non-null category 7 BirthDate 90169 non-null object 8 PrisonReleaseDate 74057 non-null object 9 ReceiptDate 90169 non-null object 10 releasedateflag_descr 90169 non-null object 11 race_descr 90169 non-null object 12 custody_description 87249 non-null object 13 FACILITY_description 90169 non-null object dtypes: category(1), object(13) memory usage: 9.0+ MB None DCNumber LastName FirstName MiddleName NameSuffix Race Sex BirthDate \ 0 000111 NELSON ALVON L NaN B M 1998-03-11 1 000132 TELFAIR MICHAEL NaN NaN B M 1967-12-04 2 000155 LOCKETT JERRY I NaN B M 1985-09-26 3 000175 ELMORE CHAD R NaN W M 1982-10-20 4 000203 RHINESMITH AUSTIN A NaN W M 2000-09-12 PrisonReleaseDate ReceiptDate releasedateflag_descr race_descr \ 0 2025-09-10 00:00:00 2024-05-16 valid release date BLACK 1 2033-04-07 00:00:00 2018-10-08 valid release date BLACK 2 2026-07-23 00:00:00 2024-05-23 valid release date BLACK 3 2028-08-29 00:00:00 2025-06-11 valid release date WHITE 4 2028-01-08 00:00:00 2025-07-24 valid release date WHITE custody_description FACILITY_description 0 CLOSE SUWANNEE C.I 1 MINIMUM MADISON C.I. 2 MEDIUM GULF C.I. 3 MINIMUM JACKSON C.I. 
4 NaN CFRC-MAIN <class 'pandas.core.frame.DataFrame'> RangeIndex: 470403 entries, 0 to 470402 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DCNumber 470403 non-null object 1 LastName 470389 non-null object 2 FirstName 470399 non-null object 3 MiddleName 342540 non-null object 4 NameSuffix 25548 non-null object 5 Race 470403 non-null object 6 Sex 470403 non-null category 7 BirthDate 470400 non-null object 8 PrisonReleaseDate 470403 non-null object 9 releasedateflag_descr 470403 non-null object 10 race_descr 470403 non-null object 11 custody_description 454906 non-null object dtypes: category(1), object(11) memory usage: 39.9+ MB None DCNumber LastName FirstName MiddleName NameSuffix Race Sex BirthDate \ 0 47880 MARTIN WILLARD NaN NaN W M 1915-06-17 1 53946 MEARNS WILLIAM W NaN W M 1922-06-01 2 56050 CARVER JESSIE NaN NaN B M 1932-10-16 3 000001 WHITNEY DENNIS NaN NaN W M 1942-08-15 4 000004 ROBBINS WAYNE NaN NaN W M 1969-01-13 PrisonReleaseDate releasedateflag_descr race_descr custody_description 0 1955-02-19 valid release date WHITE NaN 1 1965-06-13 valid release date WHITE NaN 2 2007-03-20 valid release date BLACK NaN 3 2005-04-24 deceased WHITE CLOSE 4 1997-12-11 valid release date WHITE MINIMUM
# Birthdate/sex lookup: one row per DCNumber, active records winning on duplicates.
df_birth_incar = df_incarcerated_root[['DCNumber', 'BirthDate', 'Sex']].copy()
df_birth_released = df_released_root[['DCNumber', 'BirthDate', 'Sex']].copy()
df_birth = (
    pd.concat([df_birth_incar, df_birth_released], ignore_index=True)
    .drop_duplicates(subset='DCNumber')
)
df_birth['BirthDate'] = pd.to_datetime(df_birth['BirthDate'])

# Sanity checks on the birthdate range and completeness.
print("Min birthdate:", df_birth['BirthDate'].min())
print("Max birthdate:", df_birth['BirthDate'].max())
print("Null birthdays:", df_birth['BirthDate'].isna().sum())
print(df_birth[df_birth['BirthDate'].isna()].head())
#Null birthdays verified null in https://pubapps.fdc.myflorida.com/OffenderSearch/InmateInfoMenu.aspx
Min birthdate: 1906-01-14 00:00:00 Max birthdate: 2010-01-19 00:00:00 Null birthdays: 3 DCNumber BirthDate Sex 252785 969628 NaT M 360654 K02167 NaT M 549686 Y13143 NaT M
# Quick visual check of the deduplicated birthdate/sex lookup table.
df_birth.head()
DCNumber | BirthDate | Sex | |
---|---|---|---|
0 | 000111 | 1998-03-11 | M |
1 | 000132 | 1967-12-04 | M |
2 | 000155 | 1985-09-26 | M |
3 | 000175 | 1982-10-20 | M |
4 | 000203 | 2000-09-12 | M |
# Normalize DCNumber to stripped strings on both sides so the join keys match.
for frame in (df_birth, grouped):
    frame['DCNumber'] = frame['DCNumber'].astype(str).str.strip()

# Left join: keep every person with intervals, attach birthdate/sex where known.
df_merged = grouped.merge(df_birth, on='DCNumber', how='left')
# We now get counts per-year per-population group
df = df_merged
# Reference dates: June 30 each year (matches FDOC's June 30 population snapshot convention).
ref_dates = {year: pd.Timestamp(year=year, month=6, day=30) for year in range(2019, 2025)}
# Helpers
def was_incarcerated_on(intervals, on_date):
    """True if on_date falls inside any incarceration interval (endpoints inclusive).

    Accepts a list/tuple of pd.Interval objects or plain (start, end) pairs;
    anything else (e.g. NaN from a failed merge) counts as not incarcerated.
    """
    if not isinstance(intervals, (list, tuple)) or not intervals:
        return False
    for interval in intervals:
        if hasattr(interval, 'left'):
            start, end = interval.left, interval.right
        else:
            start, end = interval
        if start <= on_date <= end:
            return True
    return False
def age_on(birth, on_date):
    """Whole-year age at on_date; pd.NA when the birthdate is missing."""
    if pd.isna(birth):
        return pd.NA
    # Subtract a year if the birthday hasn't occurred yet by on_date.
    had_birthday = (on_date.month, on_date.day) >= (birth.month, birth.day)
    return on_date.year - birth.year - (0 if had_birthday else 1)
# Coerce BirthDate to datetime in case the merge left it as object dtype.
if not pd.api.types.is_datetime64_any_dtype(df['BirthDate']):
    df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')

# One nullable-integer age column per census year; NA when the person was not
# incarcerated on that year's June 30 snapshot date.
for year, census_day in ref_dates.items():
    ages = df.apply(
        lambda row: age_on(row['BirthDate'], census_day)
        if was_incarcerated_on(row['Incar_intervals'], census_day)
        else pd.NA,
        axis=1,
    )
    df[f'age_in_{year}'] = ages.astype('Int64')

# sanity check on total population count
df[['age_in_2024']].count()
age_in_2024 88376 dtype: int64
Note: FDOC’s published “active population” tallies count only those physically present at a state facility on snapshot day. This reconstruction, using incarceration intervals, includes all individuals with an interval covering the census date, regardless of temporary status (e.g., out to court, hospitalized, etc.). This slightly overstates the population compared to FDOC snapshot counts, which is a well-known, standard limitation for historical administrative interval analysis.
from pandas.api.types import CategoricalDtype
# Age bins chosen to line up with the FDOH statewide population bins below.
age_bins = [0, 1, 5, 10, 15, 20, 25, 35, 45, 55, 65, 75, 85, 200]
age_labels = [
    "<1", "1-4", "5-9", "10-14", "15-19",
    "20-24", "25-34", "35-44", "45-54", "55-64",
    "65-74", "75-84", "85+"
]

# Remove unused categories up front so empty Sex levels don't create columns.
if isinstance(df['Sex'].dtype, CategoricalDtype):
    df['Sex'] = df['Sex'].cat.remove_unused_categories()

dfs = {}
for year in range(2019, 2025):
    # right=False makes bins left-closed, e.g. "15-19" covers [15, 20).
    binned = pd.cut(df[f'age_in_{year}'], bins=age_bins, labels=age_labels, right=False, include_lowest=True)
    counts = (
        pd.DataFrame({'age_bin': binned, 'Sex': df['Sex']})
        .groupby(['age_bin', 'Sex'], observed=True)  # observed=True skips unused categories
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    # Clear the residual axis name and use readable column labels.
    counts.columns.name = None
    counts = counts.rename(columns={'M': 'Male', 'F': 'Female'})
    dfs[year] = counts
    print(f"\nAge and sex distribution for {year}:")
    print(counts)
Age and sex distribution for 2019: age_bin Male Female 0 10-14 2 0 1 15-19 794 32 2 20-24 6733 420 3 25-34 26604 2284 4 35-44 23963 2049 5 45-54 17252 1224 6 55-64 10972 564 7 65-74 3254 123 8 75-84 618 18 9 85+ 57 0 Age and sex distribution for 2020: age_bin Female Male 0 15-19 27 561 1 20-24 341 5428 2 25-34 1872 23421 3 35-44 1820 22493 4 45-54 1115 16071 5 55-64 555 10885 6 65-74 127 3393 7 75-84 20 657 8 85+ 1 59 Age and sex distribution for 2021: age_bin Female Male 0 15-19 18 427 1 20-24 280 4552 2 25-34 1566 20518 3 35-44 1622 21212 4 45-54 934 14828 5 55-64 518 10561 6 65-74 122 3489 7 75-84 17 682 8 85+ 2 53 Age and sex distribution for 2022: age_bin Male Female 0 10-14 1 0 1 15-19 424 23 2 20-24 4334 282 3 25-34 20520 1587 4 35-44 21973 1741 5 45-54 15132 973 6 55-64 10866 531 7 65-74 3791 127 8 75-84 769 21 9 85+ 58 1 Age and sex distribution for 2023: age_bin Female Male 0 15-19 18 480 1 20-24 298 4393 2 25-34 1647 20467 3 35-44 2031 23119 4 45-54 1110 15734 5 55-64 564 11140 6 65-74 138 4135 7 75-84 27 878 8 85+ 2 70 Age and sex distribution for 2024: age_bin Female Male 0 15-19 20 504 1 20-24 309 4359 2 25-34 1706 20028 3 35-44 2149 23926 4 45-54 1210 16350 5 55-64 618 11442 6 65-74 148 4519 7 75-84 26 976 8 85+ 4 82
# Pull in FDOH data on state population from https://www.flhealthcharts.gov/FLQUERY_New/Population/Count
df_state = pd.read_csv("fdoc_deaths/csvs/Population_081220251605.csv", header=1)
df_state = df_state.iloc[2:].copy()  # skip the report's extra header rows
df_state = df_state.rename(columns={'Unnamed: 0': 'age_bin'}).reset_index(drop=True)

# Positional map: output column name -> column index in the FDOH export layout.
cols = {
    'female_2019': 1, 'male_2019': 2,
    'female_2020': 4, 'male_2020': 5,
    'female_2021': 7, 'male_2021': 8,
    'female_2022': 10, 'male_2022': 11,
    'female_2023': 13, 'male_2023': 14,
}

df_age_sex = pd.DataFrame()
df_age_sex['age_bin'] = df_state['age_bin']
for newcol, idx in cols.items():
    df_age_sex[newcol] = df_state.iloc[:, idx]

# Drop "Total"/summary rows, then coerce the count columns to numeric.
df_age_sex = df_age_sex[~df_age_sex['age_bin'].str.contains("Total", na=False)].copy()
for col in df_age_sex.columns:
    if col != 'age_bin':  # don't convert the bin labels!
        df_age_sex[col] = pd.to_numeric(df_age_sex[col])
print(df_age_sex.info(), df_age_sex)
<class 'pandas.core.frame.DataFrame'> Index: 13 entries, 0 to 12 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age_bin 13 non-null object 1 female_2019 13 non-null int64 2 male_2019 13 non-null int64 3 female_2020 13 non-null int64 4 male_2020 13 non-null int64 5 female_2021 13 non-null int64 6 male_2021 13 non-null int64 7 female_2022 13 non-null int64 8 male_2022 13 non-null int64 9 female_2023 13 non-null int64 10 male_2023 13 non-null int64 dtypes: int64(10), object(1) memory usage: 1.2+ KB None age_bin female_2019 male_2019 female_2020 male_2020 female_2021 \ 0 <1 105837 110740 100144 105001 103515 1 1-4 452146 473774 461835 483003 459749 2 5-9 565917 590432 574982 600993 583720 3 10-14 590585 614014 595752 619774 601845 4 15-19 588355 616780 592774 619696 594807 5 20-24 620093 649481 621957 648074 629862 6 25-34 1370805 1417463 1388821 1437525 1403076 7 35-44 1294285 1273377 1321951 1302602 1349385 8 45-54 1398322 1345694 1393406 1339768 1390803 9 55-64 1500802 1348036 1533563 1384124 1561551 10 65-74 1286140 1103480 1334870 1141115 1386162 11 75-84 755539 627404 795562 659897 830232 12 85+ 342951 226101 348827 234750 357797 male_2021 female_2022 male_2022 female_2023 male_2023 0 108035 108250 112813 106527 111631 1 478846 446345 465464 446903 465106 2 610256 592247 619869 591461 619054 3 624646 624767 654269 613277 643749 4 623517 617216 648019 632772 663571 5 651645 633981 651718 678944 699869 6 1454968 1401101 1438657 1405448 1460442 7 1335878 1388739 1400465 1407017 1437365 8 1337748 1398209 1374181 1396753 1385056 9 1415552 1578259 1473402 1569641 1477506 10 1180546 1418697 1231516 1418872 1227549 11 687257 818514 684502 884067 731385 12 244189 324276 223702 371580 240038
years = range(2019, 2024)  # statewide denominators are available through 2023
merged_dfs = {}
for year in years:
    fcol, mcol = f'female_{year}', f'male_{year}'
    # Join FDOC counts for this year to statewide denominators on the shared bins.
    merged = dfs[year].copy().merge(
        df_age_sex[['age_bin', fcol, mcol]], on='age_bin', how='left'
    )
    # Prefix columns so each year's frame is self-describing.
    merged = merged.rename(columns={
        'Male': f'fdoc_male_{year}',
        'Female': f'fdoc_female_{year}',
        fcol: f'flpop_female_{year}',
        mcol: f'flpop_male_{year}'
    })
    # Standard column order.
    merged = merged[['age_bin', f'fdoc_male_{year}', f'fdoc_female_{year}',
                     f'flpop_male_{year}', f'flpop_female_{year}']]
    merged_dfs[year] = merged
    # Preview result.
    print(f"\nMerged distribution for {year}:")
    print(merged)
Merged distribution for 2019: age_bin fdoc_male_2019 fdoc_female_2019 flpop_male_2019 \ 0 10-14 2 0 614014 1 15-19 794 32 616780 2 20-24 6733 420 649481 3 25-34 26604 2284 1417463 4 35-44 23963 2049 1273377 5 45-54 17252 1224 1345694 6 55-64 10972 564 1348036 7 65-74 3254 123 1103480 8 75-84 618 18 627404 9 85+ 57 0 226101 flpop_female_2019 0 590585 1 588355 2 620093 3 1370805 4 1294285 5 1398322 6 1500802 7 1286140 8 755539 9 342951 Merged distribution for 2020: age_bin fdoc_male_2020 fdoc_female_2020 flpop_male_2020 \ 0 15-19 561 27 619696 1 20-24 5428 341 648074 2 25-34 23421 1872 1437525 3 35-44 22493 1820 1302602 4 45-54 16071 1115 1339768 5 55-64 10885 555 1384124 6 65-74 3393 127 1141115 7 75-84 657 20 659897 8 85+ 59 1 234750 flpop_female_2020 0 592774 1 621957 2 1388821 3 1321951 4 1393406 5 1533563 6 1334870 7 795562 8 348827 Merged distribution for 2021: age_bin fdoc_male_2021 fdoc_female_2021 flpop_male_2021 \ 0 15-19 427 18 623517 1 20-24 4552 280 651645 2 25-34 20518 1566 1454968 3 35-44 21212 1622 1335878 4 45-54 14828 934 1337748 5 55-64 10561 518 1415552 6 65-74 3489 122 1180546 7 75-84 682 17 687257 8 85+ 53 2 244189 flpop_female_2021 0 594807 1 629862 2 1403076 3 1349385 4 1390803 5 1561551 6 1386162 7 830232 8 357797 Merged distribution for 2022: age_bin fdoc_male_2022 fdoc_female_2022 flpop_male_2022 \ 0 10-14 1 0 654269 1 15-19 424 23 648019 2 20-24 4334 282 651718 3 25-34 20520 1587 1438657 4 35-44 21973 1741 1400465 5 45-54 15132 973 1374181 6 55-64 10866 531 1473402 7 65-74 3791 127 1231516 8 75-84 769 21 684502 9 85+ 58 1 223702 flpop_female_2022 0 624767 1 617216 2 633981 3 1401101 4 1388739 5 1398209 6 1578259 7 1418697 8 818514 9 324276 Merged distribution for 2023: age_bin fdoc_male_2023 fdoc_female_2023 flpop_male_2023 \ 0 15-19 480 18 663571 1 20-24 4393 298 699869 2 25-34 20467 1647 1460442 3 35-44 23119 2031 1437365 4 45-54 15734 1110 1385056 5 55-64 11140 564 1477506 6 65-74 4135 138 1227549 7 75-84 878 27 731385 8 85+ 70 2 240038 
flpop_female_2023 0 632772 1 678944 2 1405448 3 1407017 4 1396753 5 1569641 6 1418872 7 884067 8 371580
# 2023 frame for the pyramids; females get negated copies so they plot leftward.
df = merged_dfs[2023].copy()
for col in ['fdoc_male_2023', 'fdoc_female_2023', 'flpop_male_2023', 'flpop_female_2023']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df['fdoc_female_2023_neg'] = -df['fdoc_female_2023']
df['flpop_female_2023_neg'] = -df['flpop_female_2023']
# Freeze the bin order as-is so both charts sort the y axis identically.
df['age_bin'] = pd.Categorical(df['age_bin'], categories=df['age_bin'].tolist(), ordered=True)
# Global Altair config: transparent dark-mode styling plus zero bar gaps.
alt.themes.enable('none')
dark_theme = {
    "config": {
        "background": 'rgba(0,0,0,0)',
        "axis": {
            "domainColor": "white",
            "gridColor": "white",
            "labelColor": "white",
            "tickColor": "white",
            "titleColor": "white"
        },
        "title": {"color": "white"},
        "legend": {"labelColor": "white", "titleColor": "white"},
        "view": {"stroke": "white"},
        "bar": {"binSpacing": 0}
    }
}
alt.themes.register('johnwise_black', lambda: dark_theme)
alt.themes.enable('johnwise_black')
#### --- FDOC Chart ---
# Shared base: ordered age bins on the y axis (order frozen via the Categorical above).
base_fdoc = alt.Chart(df).encode(
    y=alt.Y('age_bin', sort=None, title='Age Group', axis=alt.Axis(labelAlign='right', labelPadding=10, labelColor='white', titleColor='white')),
)
# Males extend rightward (positive counts).
male_fdoc = base_fdoc.mark_bar(color="#348ABD", size = 35).encode(
    x=alt.X('fdoc_male_2023:Q', title='FDOC Male'),
    tooltip=['age_bin', alt.Tooltip('fdoc_male_2023')]
)
# Females extend leftward via the negated column; labelExpr shows absolute values.
female_fdoc = base_fdoc.mark_bar(color="#E24A33", size = 35).encode(
    x=alt.X('fdoc_female_2023_neg:Q', title='FDOC Female',
            axis=alt.Axis(labelExpr="abs(datum.value)", titleColor='white', labelColor='white')),
    tooltip=['age_bin', alt.Tooltip('fdoc_female_2023')]
)
# Layer the two halves into a single mirrored pyramid.
pyramid_fdoc = (female_fdoc + male_fdoc).properties(
    title=alt.TitleParams('FDOC 2023 Prison Population Pyramid', color='white'),
    width=260, height=320
)
#### --- Statewide Chart ---
# Same construction as above, but using the FL statewide denominators.
base_state = alt.Chart(df).encode(
    y=alt.Y('age_bin', sort=None, title='Age Group', axis=alt.Axis(labelAlign='right', labelPadding=10, labelColor='white', titleColor='white')),
)
male_state = base_state.mark_bar(color="#348ABD", size = 35).encode(
    x=alt.X('flpop_male_2023:Q', title='FL Male'),
    tooltip=['age_bin', alt.Tooltip('flpop_male_2023')]
)
female_state = base_state.mark_bar(color="#E24A33", size = 35).encode(
    x=alt.X('flpop_female_2023_neg:Q', title='FL Female',
            axis=alt.Axis(labelExpr="abs(datum.value)", titleColor='white', labelColor='white')),
    tooltip=['age_bin', alt.Tooltip('flpop_female_2023')]
)
pyramid_state = (female_state + male_state).properties(
    title=alt.TitleParams('Florida 2023 Population Pyramid', color='white'),
    width=260, height=320
)
# Stack the prison pyramid above the statewide pyramid for comparison.
chart = alt.vconcat(
    pyramid_fdoc,
    pyramid_state
)
# Persist the Vega-Lite spec for embedding in the site, then render inline.
chart.save("assets/double_pop_pyramid.json")
chart
# information on individual deaths pulled from yearly queries https://www.fdc.myflorida.com/statistics-and-publications/inmate-mortality
# BUGFIX: use repo-relative paths (consistent with every other read in this
# file) instead of machine-specific absolute /Users/... paths.
DEATHS_DIR = Path("fdoc_deaths/data")
df19 = pd.read_csv(DEATHS_DIR / "FY 2019-2020.csv")
df20 = pd.read_csv(DEATHS_DIR / "FY 2020-2021.csv")
df21 = pd.read_csv(DEATHS_DIR / "FY 2021-2022.csv")
df22 = pd.read_csv(DEATHS_DIR / "FY 2022-2023.csv")
df23 = pd.read_csv(DEATHS_DIR / "FY 2023-2024.csv")
df_all_FY = pd.concat([df19, df20, df21, df22, df23], ignore_index=True)
print(df_all_FY.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2260 entries, 0 to 2259 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 2148 non-null object 1 DC Number 2148 non-null object 2 Date of Death 2260 non-null object 3 Institution Name 2260 non-null object 4 Manner of Death Determined by ME 2260 non-null object 5 Investigative Status 2260 non-null object dtypes: object(6) memory usage: 106.1+ KB None
# tidy: keep identity + date of death only, and normalize the join key name.
df_deaths_19_23 = df_all_FY[['Name', 'DC Number', 'Date of Death']].copy()
df_deaths_19_23 = df_deaths_19_23.rename(columns={'DC Number': 'DCNumber'})
df_deaths_19_23['Date of Death'] = pd.to_datetime(df_deaths_19_23['Date of Death'], format='%m/%d/%Y')
print(df_deaths_19_23.head(), df_deaths_19_23.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2260 entries, 0 to 2259 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 2148 non-null object 1 DCNumber 2148 non-null object 2 Date of Death 2260 non-null datetime64[ns] dtypes: datetime64[ns](1), object(2) memory usage: 53.1+ KB Name DCNumber Date of Death 0 NaN NaN 2019-07-01 1 HORTON, MICHAEL 073661 2019-07-02 2 NaN NaN 2019-07-05 3 FERGUSON, LEROY 716169 2019-07-06 4 BUSSER, WILLIAM N26218 2019-07-06 None
# Attach birthdate/sex to each death record via the OBIS lookup table.
df_deaths_births_19_23 = df_deaths_19_23.merge(df_birth, on='DCNumber', how='left')
print(df_deaths_births_19_23.info(), df_deaths_births_19_23.head())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2260 entries, 0 to 2259 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 2148 non-null object 1 DCNumber 2148 non-null object 2 Date of Death 2260 non-null datetime64[ns] 3 BirthDate 2145 non-null datetime64[ns] 4 Sex 2145 non-null object dtypes: datetime64[ns](2), object(3) memory usage: 88.4+ KB None Name DCNumber Date of Death BirthDate Sex 0 NaN NaN 2019-07-01 NaT NaN 1 HORTON, MICHAEL 073661 2019-07-02 1952-07-19 M 2 NaN NaN 2019-07-05 NaT NaN 3 FERGUSON, LEROY 716169 2019-07-06 1960-11-29 M 4 BUSSER, WILLIAM N26218 2019-07-06 1937-07-17 M
# investigate completeness: records where one side of the join is missing.
has_dc = df_deaths_births_19_23['DCNumber'].notna()
has_birth = df_deaths_births_19_23['BirthDate'].notna()

# Deaths with a DC number but no matching OBIS birthdate.
missing_birthdate_rows = df_deaths_births_19_23[has_dc & ~has_birth]
print(missing_birthdate_rows)

# The reverse (birthdate without DC number) — expected to be empty.
missing_birthdate_rows2 = df_deaths_births_19_23[has_birth & ~has_dc]
print(missing_birthdate_rows2)
Name DCNumber Date of Death BirthDate Sex 799 BAILEY, ROMULUS 168749 2021-01-05 NaT NaN 1551 JORDAN, JAMES 555250 2022-11-02 NaT NaN 2032 CHEATHAM, GERALD 083980 2023-11-28 NaT NaN Empty DataFrame Columns: [Name, DCNumber, Date of Death, BirthDate, Sex] Index: []
The names and DC numbers listed above were searched in the FDOC Offender Database (https://pubapps.fdc.myflorida.com/OffenderSearch/search.aspx); no conclusive matches were found.
# To account for missing birthdate data (~5% of deaths, presumed to be evenly
# distributed across ages and sex), the age/sex-specific population denominators
# are multiplied by the observed coverage rate of the numerator.
n_missing_birthdate = df_deaths_births_19_23['BirthDate'].isna().sum()
adj = 1 - n_missing_birthdate / len(df_all_FY)
print(adj)
0.9491150442477876
# Discard records without a matched birthdate (their share is accounted for by `adj`).
df_deaths_births_19_23 = df_deaths_births_19_23.dropna(subset=['BirthDate'])
print(df_deaths_births_19_23.info(), df_deaths_births_19_23.head())
<class 'pandas.core.frame.DataFrame'> Index: 2145 entries, 1 to 2258 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 2145 non-null object 1 DCNumber 2145 non-null object 2 Date of Death 2145 non-null datetime64[ns] 3 BirthDate 2145 non-null datetime64[ns] 4 Sex 2145 non-null object dtypes: datetime64[ns](2), object(3) memory usage: 100.5+ KB None Name DCNumber Date of Death BirthDate Sex 1 HORTON, MICHAEL 073661 2019-07-02 1952-07-19 M 3 FERGUSON, LEROY 716169 2019-07-06 1960-11-29 M 4 BUSSER, WILLIAM N26218 2019-07-06 1937-07-17 M 5 GIGLIO, DUKE 053611 2019-07-06 1954-12-16 M 6 COPPERTINO, JOHN 341241 2019-07-07 1952-05-21 M
# Calculate age at death in whole years; store as nullable Int64.
ages = [
    age_on(born, died)
    for born, died in zip(
        df_deaths_births_19_23['BirthDate'],
        df_deaths_births_19_23['Date of Death'],
    )
]
df_deaths_births_19_23['age_at_death'] = pd.array(ages, dtype='Int64')
print(df_deaths_births_19_23.head())
Name DCNumber Date of Death BirthDate Sex age_at_death 1 HORTON, MICHAEL 073661 2019-07-02 1952-07-19 M 66 3 FERGUSON, LEROY 716169 2019-07-06 1960-11-29 M 58 4 BUSSER, WILLIAM N26218 2019-07-06 1937-07-17 M 81 5 GIGLIO, DUKE 053611 2019-07-06 1954-12-16 M 64 6 COPPERTINO, JOHN 341241 2019-07-07 1952-05-21 M 67
# Count deaths per age bin and sex, one table per calendar year.
deaths_by_year = {}
for yr in range(2019, 2024):
    in_year = df_deaths_births_19_23['Date of Death'].dt.year == yr
    df_yr = df_deaths_births_19_23[in_year].copy()
    # Bin ages with left-closed intervals ([lo, hi)), matching age_bins/age_labels.
    df_yr['age_bin'] = pd.cut(
        df_yr['age_at_death'],
        bins=age_bins,
        labels=age_labels,
        right=False,
        include_lowest=True
    )
    # Pivot to one row per age bin with M/F counts as columns.
    counts = (
        df_yr.groupby(['age_bin', 'Sex'], observed=True)
             .size()
             .unstack(fill_value=0)
             .reset_index()
    )
    counts.columns.name = None
    counts = counts.rename(
        columns={'M': f'male_deaths_{yr}', 'F': f'female_deaths_{yr}'}
    )
    deaths_by_year[yr] = counts
    print(f"\nDeaths by age and sex, {yr}:")
    print(counts)
Deaths by age and sex, 2019: age_bin female_deaths_2019 male_deaths_2019 0 25-34 1 11 1 35-44 1 14 2 45-54 1 30 3 55-64 1 46 4 65-74 0 52 5 75-84 1 27 6 85+ 0 3 Deaths by age and sex, 2020: age_bin female_deaths_2020 male_deaths_2020 0 20-24 0 2 1 25-34 3 18 2 35-44 4 47 3 45-54 1 63 4 55-64 6 146 5 65-74 5 171 6 75-84 3 86 7 85+ 0 17 Deaths by age and sex, 2021: age_bin female_deaths_2021 male_deaths_2021 0 25-34 1 18 1 35-44 2 34 2 45-54 4 48 3 55-64 1 131 4 65-74 2 108 5 75-84 1 59 6 85+ 0 11 Deaths by age and sex, 2022: age_bin female_deaths_2022 male_deaths_2022 0 25-34 1 20 1 35-44 3 31 2 45-54 1 53 3 55-64 0 103 4 65-74 2 95 5 75-84 0 52 6 85+ 0 8 Deaths by age and sex, 2023: age_bin female_deaths_2023 male_deaths_2023 0 20-24 1 4 1 25-34 1 26 2 35-44 1 35 3 45-54 1 54 4 55-64 4 108 5 65-74 4 101 6 75-84 3 65 7 85+ 0 19
# Attach the death counts to each year's population table on age_bin.
final_merged = {}
for yr in range(2019, 2024):
    merged = merged_dfs[yr].merge(deaths_by_year[yr], on='age_bin', how='left')
    # Age bins with no deaths come through as NaN; treat them as zero counts.
    for col in (f'male_deaths_{yr}', f'female_deaths_{yr}'):
        if col in merged.columns:
            merged[col] = merged[col].fillna(0).astype(int)
    final_merged[yr] = merged
    print(f"\nMerged table for {yr}:")
    print(merged)
Merged table for 2019: age_bin fdoc_male_2019 fdoc_female_2019 flpop_male_2019 \ 0 10-14 2 0 614014 1 15-19 794 32 616780 2 20-24 6733 420 649481 3 25-34 26604 2284 1417463 4 35-44 23963 2049 1273377 5 45-54 17252 1224 1345694 6 55-64 10972 564 1348036 7 65-74 3254 123 1103480 8 75-84 618 18 627404 9 85+ 57 0 226101 flpop_female_2019 female_deaths_2019 male_deaths_2019 0 590585 0 0 1 588355 0 0 2 620093 0 0 3 1370805 1 11 4 1294285 1 14 5 1398322 1 30 6 1500802 1 46 7 1286140 0 52 8 755539 1 27 9 342951 0 3 Merged table for 2020: age_bin fdoc_male_2020 fdoc_female_2020 flpop_male_2020 \ 0 15-19 561 27 619696 1 20-24 5428 341 648074 2 25-34 23421 1872 1437525 3 35-44 22493 1820 1302602 4 45-54 16071 1115 1339768 5 55-64 10885 555 1384124 6 65-74 3393 127 1141115 7 75-84 657 20 659897 8 85+ 59 1 234750 flpop_female_2020 female_deaths_2020 male_deaths_2020 0 592774 0 0 1 621957 0 2 2 1388821 3 18 3 1321951 4 47 4 1393406 1 63 5 1533563 6 146 6 1334870 5 171 7 795562 3 86 8 348827 0 17 Merged table for 2021: age_bin fdoc_male_2021 fdoc_female_2021 flpop_male_2021 \ 0 15-19 427 18 623517 1 20-24 4552 280 651645 2 25-34 20518 1566 1454968 3 35-44 21212 1622 1335878 4 45-54 14828 934 1337748 5 55-64 10561 518 1415552 6 65-74 3489 122 1180546 7 75-84 682 17 687257 8 85+ 53 2 244189 flpop_female_2021 female_deaths_2021 male_deaths_2021 0 594807 0 0 1 629862 0 0 2 1403076 1 18 3 1349385 2 34 4 1390803 4 48 5 1561551 1 131 6 1386162 2 108 7 830232 1 59 8 357797 0 11 Merged table for 2022: age_bin fdoc_male_2022 fdoc_female_2022 flpop_male_2022 \ 0 10-14 1 0 654269 1 15-19 424 23 648019 2 20-24 4334 282 651718 3 25-34 20520 1587 1438657 4 35-44 21973 1741 1400465 5 45-54 15132 973 1374181 6 55-64 10866 531 1473402 7 65-74 3791 127 1231516 8 75-84 769 21 684502 9 85+ 58 1 223702 flpop_female_2022 female_deaths_2022 male_deaths_2022 0 624767 0 0 1 617216 0 0 2 633981 0 0 3 1401101 1 20 4 1388739 3 31 5 1398209 1 53 6 1578259 0 103 7 1418697 2 95 8 818514 0 52 9 324276 0 8 Merged 
table for 2023: age_bin fdoc_male_2023 fdoc_female_2023 flpop_male_2023 \ 0 15-19 480 18 663571 1 20-24 4393 298 699869 2 25-34 20467 1647 1460442 3 35-44 23119 2031 1437365 4 45-54 15734 1110 1385056 5 55-64 11140 564 1477506 6 65-74 4135 138 1227549 7 75-84 878 27 731385 8 85+ 70 2 240038 flpop_female_2023 female_deaths_2023 male_deaths_2023 0 632772 0 0 1 678944 1 4 2 1405448 1 26 3 1407017 1 35 4 1396753 1 54 5 1569641 4 108 6 1418872 4 101 7 884067 3 65 8 371580 0 19
# Project FDOC age/sex-specific death rates onto the general FL population:
# (deaths / FDOC population) * FL population, scaled by the coverage adjustment.
for yr in range(2019, 2024):
    tbl = final_merged[yr]
    for sex in ('male', 'female'):
        fdoc_rate = tbl[f'{sex}_deaths_{yr}'] / tbl[f'fdoc_{sex}_{yr}']
        tbl[f'{sex}_deaths_proj'] = fdoc_rate * tbl[f'flpop_{sex}_{yr}'] * adj
# sanity check
final_merged[2020]
age_bin | fdoc_male_2020 | fdoc_female_2020 | flpop_male_2020 | flpop_female_2020 | female_deaths_2020 | male_deaths_2020 | male_deaths_proj | female_deaths_proj | |
---|---|---|---|---|---|---|---|---|---|
0 | 15-19 | 561 | 27 | 619696 | 592774 | 0 | 0 | 0.000000 | 0.000000 |
1 | 20-24 | 5428 | 341 | 648074 | 621957 | 0 | 2 | 226.638461 | 0.000000 |
2 | 25-34 | 23421 | 1872 | 1437525 | 1388821 | 3 | 18 | 1048.579432 | 2112.421322 |
3 | 35-44 | 22493 | 1820 | 1302602 | 1321951 | 4 | 47 | 2583.337051 | 2757.546334 |
4 | 45-54 | 16071 | 1115 | 1339768 | 1393406 | 1 | 63 | 4984.781269 | 1186.100984 |
5 | 55-64 | 10885 | 555 | 1384124 | 1533563 | 6 | 146 | 17620.502074 | 15735.434752 |
6 | 65-74 | 3393 | 127 | 1141115 | 1334870 | 5 | 171 | 54583.392203 | 49879.732249 |
7 | 75-84 | 657 | 20 | 659897 | 795562 | 3 | 86 | 81983.809209 | 113261.979425 |
8 | 85+ | 59 | 1 | 234750 | 348827 | 0 | 17 | 64197.980726 | 0.000000 |
# Age-adjusted death rate (AADR) per 100,000 for each year:
# total projected deaths over the whole FL population.
years = range(2019, 2024)
results = []
for yr in years:
    tbl = final_merged[yr]
    total_proj = tbl['male_deaths_proj'].sum() + tbl['female_deaths_proj'].sum()
    total_pop = tbl[f'flpop_male_{yr}'].sum() + tbl[f'flpop_female_{yr}'].sum()
    results.append({'year': yr, 'AADR': round(total_proj / total_pop * 100000, 2)})
# Output DataFrame with rates per year
rates_df = pd.DataFrame(results)
print(rates_df)
# Mean across the 5 years
mean_aadr = round(rates_df['AADR'].mean(), 2)
print(f"\nMean AADR (2019–2023): {mean_aadr}")
year AADR 0 2019 566.76 1 2020 2277.23 2 2021 1313.25 3 2022 746.67 4 2023 1633.91 Mean AADR (2019–2023): 1307.56
# Screenshot of the FDOH query backing the statewide crude death rate.
IMG_PATH = "assets/Screenshot 2025-08-17 at 4.23.46 PM.png"
# Optional metadata
FDOH_URL = "https://www.flhealthcharts.gov/FLQUERY_New/Population/Count"
QUERY_TEXT = "Crude Death Rate by Year, All causes"
RETRIEVED = "2025-08-17 at 22.23.46 UTC"
# Embed the image bytes so the figure survives HTML export.
display(Image(filename=IMG_PATH, embed=True, width=1000))
# Caption rendered beneath the image.
caption = (
    f"Source: [FDOH data portal]({FDOH_URL}) \n"
    f"Query: {QUERY_TEXT} \n"
    f"Retrieved: {RETRIEVED}"
)
display(Markdown(caption))
Source: FDOH data portal
Query: Crude Death Rate by Year, All causes
Retrieved: 2025-08-17 at 22.23.46 UTC
# Load FDOH statewide crude death rates (wide: one column per year).
df_fl_crude = pd.read_csv('fdoc_deaths/csvs/FDOH_Crude_death_rate.csv', header=2)
# Reshape df_fl_crude from wide to long (pivot); drop the trailing artifact column.
df_fl_crude_long = df_fl_crude.drop(columns=['Unnamed: 10'], errors='ignore').melt(
    var_name='year', value_name='FL_Crude_Death_Rate'
)
# Filter to years of interest BEFORE the int cast: melt puts every column name
# into 'year', so casting first would crash on any non-numeric column name.
years = [2019, 2020, 2021, 2022, 2023]
wanted = {str(y) for y in years}
df_fl_crude_long = df_fl_crude_long[
    df_fl_crude_long['year'].astype(str).isin(wanted)
].copy()
df_fl_crude_long['year'] = df_fl_crude_long['year'].astype(int)
# Merge with rates dataframe
combined = rates_df.merge(df_fl_crude_long, on='year', how='left')
print(combined)
year AADR FL_Crude_Death_Rate 0 2019 566.76 973.2 1 2020 2277.23 1106.2 2 2021 1313.25 1187.2 3 2022 746.67 1070.1 4 2023 1633.91 1009.7
# Build a long-format frame so the bar (FDOC AADR) and line (FL crude rate)
# layers share a single 'value' column, tagged by Series.
df_bar = combined.copy()
df_bar['Series'] = 'FDOC AADR'
df_bar = df_bar.rename(columns={'AADR': 'value'})
df_line = combined.copy()
df_line['Series'] = 'FL Crude Death Rate'
df_line = df_line.rename(columns={'FL_Crude_Death_Rate': 'value'})
long_df = pd.concat([df_bar[['year','Series','value']], df_line[['year','Series','value']]], ignore_index=True)
# One shared color mapping for both series.
color_scale = alt.Scale(
    domain=['FDOC AADR', 'FL Crude Death Rate'],
    range=['#4bb4e6', '#ffa600']
)
base = alt.Chart(long_df).encode(
    x=alt.X('year:O', title='Year')
)
# Bars: FDOC age-adjusted death rate by year.
bars = base.transform_filter(
    alt.datum.Series == 'FDOC AADR'
).mark_bar(
    size=40,
    opacity=0.85,
    cornerRadiusTopLeft=8,
    cornerRadiusTopRight=8
).encode(
    y=alt.Y('value:Q', title='Death Rate (per 100,000)'),
    color=alt.Color('Series:N', scale=color_scale, legend=alt.Legend(title=None)),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FDOC AADR')
    ]
)
# Line + point markers: statewide crude death rate.
line = base.transform_filter(
    alt.datum.Series == 'FL Crude Death Rate'
).mark_line(
    interpolate='monotone',
    strokeWidth=3
).encode(
    y='value:Q',
    color=alt.Color('Series:N', scale=color_scale, legend=alt.Legend(title=None)),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FL Crude Death Rate')
    ]
)
points = base.transform_filter(
    alt.datum.Series == 'FL Crude Death Rate'
).mark_circle(size=120).encode(
    y='value:Q',
    color=alt.Color('Series:N', scale=color_scale, legend=None),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FL Crude Death Rate')
    ]
)
# Dashed red rule marking the 5-year mean FDOC AADR.
mean_aadr = round(combined['AADR'].mean(), 2)
mean_rule = alt.Chart(pd.DataFrame({'mean_aadr': [mean_aadr]})).mark_rule(
    color='red', strokeDash=[8,4], size=3
).encode(
    y='mean_aadr:Q',
    tooltip=[alt.Tooltip('mean_aadr:Q', format='.2f', title='Mean FDOC AADR')]
)
# FIX: mean_rule was constructed but never layered into the chart — add it
# so the mean-AADR reference line actually appears in the figure.
final_plot = (bars + line + points + mean_rule).properties(
    width=500,
    height=320,
    title=alt.TitleParams(
        'Florida Statewide Death Rate vs. FDOC Age Adjusted Death Rate (2019–2023)',
        color='white'
    )
)
final_plot.save("assets/FDOC_AADR_vs_FL_Crude.json")
final_plot