# Code Block 1
from io import StringIO
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup

SNAPSHOT_PATH = Path("fdoc_deaths/data/Inmate Mortality _ Statistics and Publications - Florida Department of Corrections.html")
OUT_CSV = Path("fdoc_deaths/data/FDOC's Mortality Table.csv")

with SNAPSHOT_PATH.open(encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# Try the first <table>; fall back to scanning the whole document
table = soup.find("table")
if table is not None:
    df = pd.read_html(StringIO(str(table)))[0]
else:
    tables = pd.read_html(StringIO(str(soup)))
    if not len(tables):
        raise ValueError(f"No <table> elements found in snapshot: {SNAPSHOT_PATH}")
    df = tables[0]

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=False)
print(f"Saved CSV to {OUT_CSV} with shape {df.shape}")

Saved CSV to fdoc_deaths/data/FDOC's Mortality Table.csv with shape (6, 8)

# Code Block 2
from IPython.display import Image, display, Markdown


IMG_PATH = "assets/FDOH Screenshot 2025-08-05 at 9.53.12 AM.png"

# Optional metadata
FDOH_URL = "https://www.flhealthcharts.gov/FLQUERY_New/Death/Rate"
QUERY_TEXT = "Crude Death Rate by Year, Homicide by Firearms Discharge, Homicide by Other and Unspecified Means"
RETRIEVED = "2025-08-05 at 13.53.12 UTC"

# Display the image (embedded in exported HTML)
display(Image(filename=IMG_PATH, embed=True, width=1000))

# Caption below the image
display(Markdown(
    f"Source: [FDOH data portal]({FDOH_URL})  \n"
    f"Query: {QUERY_TEXT}  \n"
    f"Retrieved: {RETRIEVED}"
))

# Code Block 3
df_m = pd.read_csv("fdoc_deaths/data/FDOC's Mortality Table.csv")
#print(df_m)

df_m = df_m[df_m['Inmate Population on June 30th'] != 0]

# Florida Yearly Homicde rates from screenshot above
fl_rates = [6.3, 7.0, 6.6, 6.6, 5.9]

df_m['Homicide Rate'] = (df_m['Homicide'] / df_m['Inmate Population on June 30th'] * 100000).round(1)
df_m['FL Statewide Homicide Rate'] = fl_rates
df_m.drop(['Accident', 'Suicide', 'Pending', "Natural*", "Total"], axis=1, inplace=True)

mean_fd_homicide = df_m['Homicide Rate'].mean().round(1)
mean_fl_homicide = df_m['FL Statewide Homicide Rate'].mean().round(1)

# Build a summary row
summary_row = {
    'Fiscal Year': 'MEAN',
    'Homicide': '',
    'Inmate Population on June 30th': '',
    'Homicide Rate': mean_fd_homicide,
    'FL Statewide Homicide Rate': mean_fl_homicide
}

# Append the row
df_m = pd.concat([df_m, pd.DataFrame([summary_row])], ignore_index=True)

table_html = df_m.to_html(index=False, classes='striped-table')

# Wrap it in a full HTML5 doc
full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>FDOC Mortality Table (2019–2024)</title>
  <link rel="stylesheet" href="../style.css">
</head>
<body>
  <h2>FDOC Annual Mortality Table (2019–2024)</h2>
  {table_html}
</body>
</html>
"""

with open("fdoc_deaths/FDOC_mortality_table.html", "w") as f:
    f.write(full_html)

# Code Block 4
import altair as alt

data = pd.DataFrame({
    'GroupShort': [
        'FDOC Prison Homicide',
        'Global Prison Homicide',
        'FL State Homicide'
    ],
    'GroupFull': [
        'FDOC In-Prison Homicide (2019–24)',
        'Global Prison Homicide (UNODC, 2024)',
        'Florida Statewide Homicide (2019–24)'
    ],
    'Rate': [20.35, 12.2, 6.48]
})

highlight = ['FDOC Prison Homicide']

chart = alt.Chart(data).mark_bar(
    cornerRadiusEnd=8,
    size=35
).encode(
    x=alt.X(
        'Rate:Q',
        title='Deaths per 1,000,000',
        axis=alt.Axis(
            labelFontSize=13,
            titleFontSize=13,
            labelColor='white',
            titleColor='white'
        ),
        scale=alt.Scale(domain=[0, 22])
    ),
    y=alt.Y(
        'GroupShort:N',
        sort='-x',
        axis=alt.Axis(
            title=None,
            labelFontSize=13,
            labelAlign='right',
            labelPadding=12,
            labelColor='white'
        )
    ),
    color=alt.condition(
        alt.FieldOneOfPredicate(field="GroupShort", oneOf=highlight),
        alt.value('#e8596a'),
        alt.value('#166a6a')
    ),
    tooltip=[
        alt.Tooltip('GroupFull:N', title='Group'),
        alt.Tooltip('Rate:Q', title='Deaths per 1,000,000', format='.1f')
    ]
).properties(
    width=500, height=180
).configure_view(
    stroke=None,
    fill=None
).configure_axis(
    grid=True
).configure(
    background='rgba(0,0,0,0)'
)
chart.save("assets/deaths_per_1000000.json")
chart.display()

# Code Block 5
import os

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

def convert_xlsx_sheets_to_csvs(
    xlsx_path,
    output_dir=None,
    prefix=None
):
    """Convert all sheets in an Excel file to CSV files. Prints the mapping."""
    # Prepare output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(xlsx_path))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if prefix is None:
        prefix = os.path.splitext(os.path.basename(xlsx_path))[0]
    # Load Excel file (all sheets)
    xl = pd.ExcelFile(xlsx_path)
    mapping = {}
    for sheet in xl.sheet_names:
        # Clean sheet name for filesystems
        safe_sheet = ''.join(c if c.isalnum() or c in (' ','-','_') else '_' for c in sheet).rstrip()
        outpath = os.path.join(output_dir, f"{prefix}__{safe_sheet}.csv")
        df = xl.parse(sheet)
        df.to_csv(outpath, index=False)
        mapping[sheet] = outpath
        print(f"Sheet '{sheet}' -> '{outpath}'")
    return mapping

# --- Usage ---
mapping = convert_xlsx_sheets_to_csvs('fdoc_deaths/data/Inmate_Active.XLSX', output_dir='./fdoc_deaths/csvs')
mapping = convert_xlsx_sheets_to_csvs('fdoc_deaths/data/Inmate_Release.XLSX', output_dir='./fdoc_deaths/csvs')

Sheet 'Inmate Active Root' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Root.csv'
Sheet 'Inmate Active Aliases' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Aliases.csv'
Sheet 'Inmate Active Offenses CPS' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Offenses CPS.csv'
Sheet 'Inmate Active Offenses PRPR' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Offenses PRPR.csv'
Sheet 'Inmate Active Detainers' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Detainers.csv'
Sheet 'Inmate Active Incarhist' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Incarhist.csv'
Sheet 'Inmate Release Root' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Root.csv'
Sheet 'Inmate Release Alias_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_1.csv'
Sheet 'Inmate Release Alias_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_2.csv'
Sheet 'Inmate Release Alias_3' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_3.csv'
Sheet 'Inmate Release Offenses CPS_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses CPS_1.csv'
Sheet 'Inmate Release Offenses CPS_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses CPS_2.csv'
Sheet 'Inmate Release Offenses PRPR_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses PRPR_1.csv'
Sheet 'Inmate Release Offenses PRPR_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses PRPR_2.csv'
Sheet 'Inmate Release Detainer' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Detainer.csv'
Sheet 'Inmate Release Incarhist' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Incarhist.csv'

# Code Block 6
df_incarcerated_incarhist = pd.read_csv("fdoc_deaths/csvs/Inmate_Active__Inmate Active Incarhist.csv", dtype={'DCNumber': 'object'})
print(df_incarcerated_incarhist.info(), df_incarcerated_incarhist.head())

df_released_incarhist = pd.read_csv("fdoc_deaths/csvs/Inmate_Release__Inmate Release Incarhist.csv", dtype={'DCNumber': 'object'})
print(df_released_incarhist.info(), df_released_incarhist.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196525 entries, 0 to 196524
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DCNumber     196525 non-null  object
 1   ReceiptDate  196525 non-null  object
 2   ReleaseDate  106400 non-null  object
dtypes: object(3)
memory usage: 4.5+ MB
None   DCNumber ReceiptDate ReleaseDate
0   000111  2022-08-09  2023-09-18
1   000111  2024-05-16         NaN
2   000132  2018-10-08         NaN
3   000155  2024-05-23         NaN
4   000155  2018-10-24  2019-03-27
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915242 entries, 0 to 915241
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DCNumber     915242 non-null  object
 1   ReceiptDate  915242 non-null  object
 2   ReleaseDate  915192 non-null  object
dtypes: object(3)
memory usage: 20.9+ MB
None   DCNumber ReceiptDate ReleaseDate
0    47880  1954-04-02  1955-02-19
1    47880  1951-02-12  1953-09-28
2    53946  1954-11-27  1955-08-17
3    53946  1965-06-03  1965-06-13
4    56050  1955-11-09  1957-08-13

# Code Block 7
from datetime import datetime

df_all_hist = pd.concat([df_incarcerated_incarhist, df_released_incarhist], ignore_index=True)


df_all_hist['ReceiptDate'] = pd.to_datetime(df_all_hist['ReceiptDate'])
df_all_hist['ReleaseDate'] = pd.to_datetime(df_all_hist['ReleaseDate'])


# Fill ongoing incarcerations with extraction date - 'presently incarcerated'
DATA_PULL_DATE = pd.Timestamp('2025-08-11')
df_all_hist['ReleaseDate'] = df_all_hist['ReleaseDate'].fillna(DATA_PULL_DATE)
print(df_all_hist.info(), df_all_hist.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111767 entries, 0 to 1111766
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   DCNumber     1111767 non-null  object        
 1   ReceiptDate  1111767 non-null  datetime64[ns]
 2   ReleaseDate  1111767 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 25.4+ MB
None   DCNumber ReceiptDate ReleaseDate
0   000111  2022-08-09  2023-09-18
1   000111  2024-05-16  2025-08-11
2   000132  2018-10-08  2025-08-11
3   000155  2024-05-23  2025-08-11
4   000155  2018-10-24  2019-03-27

# Code Block 8
df_all_hist = df_all_hist.sort_values(['DCNumber', 'ReceiptDate'])

# Group and produce lists of pd.Intervals
def make_intervals(subdf):
    intervals = []
    for rec, rel in zip(subdf['ReceiptDate'], subdf['ReleaseDate']):
        if pd.notnull(rec) and pd.notnull(rel) and rec <= rel:
            intervals.append(pd.Interval(left=rec, right=rel, closed='both'))
    return intervals

grouped = (
    df_all_hist
    .sort_values(['DCNumber', 'ReceiptDate'])
    .groupby('DCNumber', group_keys=False)[['ReceiptDate', 'ReleaseDate']]
    .apply(make_intervals)
    .reset_index(name='Incar_intervals')
)

print(grouped.head())

  DCNumber  \
0    47880   
1    53946   
2    56050   
3   000001   
4   000004   

                                                                                                                                                                                                                Incar_intervals  
0                                                                                                                                      [[1951-02-12 00:00:00, 1953-09-28 00:00:00], [1954-04-02 00:00:00, 1955-02-19 00:00:00]]  
1                                                                                                                                      [[1954-11-27 00:00:00, 1955-08-17 00:00:00], [1965-06-03 00:00:00, 1965-06-13 00:00:00]]  
2  [[1955-11-09 00:00:00, 1957-08-13 00:00:00], [1957-09-03 00:00:00, 1965-11-09 00:00:00], [1969-02-26 00:00:00, 1971-12-28 00:00:00], [1979-02-08 00:00:00, 1980-11-07 00:00:00], [1980-11-10 00:00:00, 1981-07-14 00:00:00]]  
3                                                                                                                                                                                  [[1960-07-01 00:00:00, 2005-04-24 00:00:00]]  
4                                                                                                                                                                                  [[1995-10-25 00:00:00, 1997-12-11 00:00:00]]

# Code Block 9
df_incarcerated_root = pd.read_csv("fdoc_deaths/csvs/Inmate_Active__Inmate Active Root.csv", dtype={'DCNumber': 'object', 'Sex':'category'})
print(df_incarcerated_root.info(), df_incarcerated_root.head())

df_released_root = pd.read_csv("fdoc_deaths/csvs/Inmate_Release__Inmate Release Root.csv", dtype={'DCNumber': 'object', 'Sex':'category'})
print(df_released_root.info(), df_released_root.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90169 entries, 0 to 90168
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   DCNumber               90169 non-null  object  
 1   LastName               90168 non-null  object  
 2   FirstName              90169 non-null  object  
 3   MiddleName             68553 non-null  object  
 4   NameSuffix             7372 non-null   object  
 5   Race                   90169 non-null  object  
 6   Sex                    90169 non-null  category
 7   BirthDate              90169 non-null  object  
 8   PrisonReleaseDate      74057 non-null  object  
 9   ReceiptDate            90169 non-null  object  
 10  releasedateflag_descr  90169 non-null  object  
 11  race_descr             90169 non-null  object  
 12  custody_description    87249 non-null  object  
 13  FACILITY_description   90169 non-null  object  
dtypes: category(1), object(13)
memory usage: 9.0+ MB
None   DCNumber    LastName FirstName MiddleName NameSuffix Race Sex   BirthDate  \
0   000111      NELSON     ALVON          L        NaN    B   M  1998-03-11   
1   000132     TELFAIR   MICHAEL        NaN        NaN    B   M  1967-12-04   
2   000155     LOCKETT     JERRY          I        NaN    B   M  1985-09-26   
3   000175      ELMORE      CHAD          R        NaN    W   M  1982-10-20   
4   000203  RHINESMITH    AUSTIN          A        NaN    W   M  2000-09-12   

     PrisonReleaseDate ReceiptDate releasedateflag_descr race_descr  \
0  2025-09-10 00:00:00  2024-05-16    valid release date      BLACK   
1  2033-04-07 00:00:00  2018-10-08    valid release date      BLACK   
2  2026-07-23 00:00:00  2024-05-23    valid release date      BLACK   
3  2028-08-29 00:00:00  2025-06-11    valid release date      WHITE   
4  2028-01-08 00:00:00  2025-07-24    valid release date      WHITE   

  custody_description FACILITY_description  
0               CLOSE         SUWANNEE C.I  
1             MINIMUM         MADISON C.I.  
2              MEDIUM            GULF C.I.  
3             MINIMUM         JACKSON C.I.  
4                 NaN            CFRC-MAIN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470403 entries, 0 to 470402
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   DCNumber               470403 non-null  object  
 1   LastName               470389 non-null  object  
 2   FirstName              470399 non-null  object  
 3   MiddleName             342540 non-null  object  
 4   NameSuffix             25548 non-null   object  
 5   Race                   470403 non-null  object  
 6   Sex                    470403 non-null  category
 7   BirthDate              470400 non-null  object  
 8   PrisonReleaseDate      470403 non-null  object  
 9   releasedateflag_descr  470403 non-null  object  
 10  race_descr             470403 non-null  object  
 11  custody_description    454906 non-null  object  
dtypes: category(1), object(11)
memory usage: 39.9+ MB
None   DCNumber LastName FirstName MiddleName NameSuffix Race Sex   BirthDate  \
0    47880   MARTIN   WILLARD        NaN        NaN    W   M  1915-06-17   
1    53946   MEARNS   WILLIAM          W        NaN    W   M  1922-06-01   
2    56050   CARVER    JESSIE        NaN        NaN    B   M  1932-10-16   
3   000001  WHITNEY    DENNIS        NaN        NaN    W   M  1942-08-15   
4   000004  ROBBINS     WAYNE        NaN        NaN    W   M  1969-01-13   

  PrisonReleaseDate releasedateflag_descr race_descr custody_description  
0        1955-02-19    valid release date      WHITE                 NaN  
1        1965-06-13    valid release date      WHITE                 NaN  
2        2007-03-20    valid release date      BLACK                 NaN  
3        2005-04-24              deceased      WHITE               CLOSE  
4        1997-12-11    valid release date      WHITE             MINIMUM

# Code Block 10
df_birth_incar = df_incarcerated_root[['DCNumber', 'BirthDate', 'Sex']].copy()
df_birth_released = df_released_root[['DCNumber', 'BirthDate', 'Sex']].copy()

df_birth = pd.concat([df_birth_incar, df_birth_released], ignore_index=True)


df_birth['BirthDate'] = pd.to_datetime(df_birth['BirthDate'])

print("Min birthdate:", df_birth['BirthDate'].min())
print("Max birthdate:", df_birth['BirthDate'].max())
print("Null birthdays:", df_birth['BirthDate'].isna().sum())
print(df_birth[df_birth['BirthDate'].isna()].head())

Min birthdate: 1906-01-14 00:00:00
Max birthdate: 2010-01-19 00:00:00
Null birthdays: 3
       DCNumber BirthDate Sex
252785   969628       NaT   M
360654   K02167       NaT   M
549686   Y13143       NaT   M

# Code Block 11
# Ensure DCNumber is of the same type in both DataFrames
df_birth['DCNumber'] = df_birth['DCNumber'].astype(str).str.strip()
grouped['DCNumber'] = grouped['DCNumber'].astype(str).str.strip()

# Perform a left join:
df_merged = grouped.merge(df_birth, on='DCNumber', how='left')

# Code Block 12
df = df_merged

# Reference dates: June 30 each year
ref_dates = {year: pd.Timestamp(year=year, month=6, day=30) for year in range(2019, 2025)}

# Helpers
def was_incarcerated_on(intervals, on_date):
    if not isinstance(intervals, (list, tuple)) or len(intervals) == 0:
        return False
    for iv in intervals:
        try:
            left, right = iv.left, iv.right
        except AttributeError:
            left, right = iv
        if left <= on_date <= right:
            return True
    return False

def age_on(birth, on_date):
    if pd.isna(birth):
        return pd.NA
    return (on_date.year - birth.year) - ( (on_date.month, on_date.day) < (birth.month, birth.day) )

# Ensure BirthDate is datetime
if not pd.api.types.is_datetime64_any_dtype(df['BirthDate']):
    df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')

# Compute age columns
for year, d in ref_dates.items():
    col = f'age_in_{year}'
    df[col] = df.apply(
        lambda r: age_on(r['BirthDate'], d) if was_incarcerated_on(r['Incar_intervals'], d) else pd.NA,
        axis=1
    ).astype('Int64')
    
# sanity check on total population count
df[['age_in_2024']].count()

age_in_2024    88376
dtype: int64

# Code Block 13
from pandas.api.types import CategoricalDtype

age_bins = [0, 1, 5, 10, 15, 20, 25, 35, 45, 55, 65, 75, 85, 200]
age_labels = [
    "<1", "1-4", "5-9", "10-14", "15-19",
    "20-24", "25-34", "35-44", "45-54", "55-64",
    "65-74", "75-84", "85+"
]
# Remove unused categories up front
if isinstance(df['Sex'].dtype, CategoricalDtype):
    df['Sex'] = df['Sex'].cat.remove_unused_categories()

dfs = {}
for year in range(2019, 2025):
    age_col = f'age_in_{year}'
    bins = pd.cut(df[age_col], bins=age_bins, labels=age_labels, right=False, include_lowest=True)
    temp = pd.DataFrame({'age_bin': bins, 'Sex': df['Sex']})  
    # Group and pivot; observed=True to avoid unused category columns
    group = temp.groupby(['age_bin', 'Sex'], observed=True).size().unstack(fill_value=0)
    # Reset index so 'age_bin' is a column again, and manually clear any index/column names
    group = group.reset_index()
    group.columns.name = None
    group = group.rename(columns={'M': 'Male', 'F': 'Female'})
    dfs[year] = group
    print(f"\nAge and sex distribution for {year}:")
    print(group)

Age and sex distribution for 2019:
  age_bin   Male  Female
0   10-14      2       0
1   15-19    794      32
2   20-24   6733     420
3   25-34  26604    2284
4   35-44  23963    2049
5   45-54  17252    1224
6   55-64  10972     564
7   65-74   3254     123
8   75-84    618      18
9     85+     57       0

Age and sex distribution for 2020:
  age_bin  Female   Male
0   15-19      27    561
1   20-24     341   5428
2   25-34    1872  23421
3   35-44    1820  22493
4   45-54    1115  16071
5   55-64     555  10885
6   65-74     127   3393
7   75-84      20    657
8     85+       1     59

Age and sex distribution for 2021:
  age_bin  Female   Male
0   15-19      18    427
1   20-24     280   4552
2   25-34    1566  20518
3   35-44    1622  21212
4   45-54     934  14828
5   55-64     518  10561
6   65-74     122   3489
7   75-84      17    682
8     85+       2     53

Age and sex distribution for 2022:
  age_bin   Male  Female
0   10-14      1       0
1   15-19    424      23
2   20-24   4334     282
3   25-34  20520    1587
4   35-44  21973    1741
5   45-54  15132     973
6   55-64  10866     531
7   65-74   3791     127
8   75-84    769      21
9     85+     58       1

Age and sex distribution for 2023:
  age_bin  Female   Male
0   15-19      18    480
1   20-24     298   4393
2   25-34    1647  20467
3   35-44    2031  23119
4   45-54    1110  15734
5   55-64     564  11140
6   65-74     138   4135
7   75-84      27    878
8     85+       2     70

Age and sex distribution for 2024:
  age_bin  Female   Male
0   15-19      20    504
1   20-24     309   4359
2   25-34    1706  20028
3   35-44    2149  23926
4   45-54    1210  16350
5   55-64     618  11442
6   65-74     148   4519
7   75-84      26    976
8     85+       4     82

# Code Block 14
df_state = pd.read_csv("fdoc_deaths/csvs/Population_081220251605.csv", header=1)
df_state = df_state.iloc[2:].copy()
df_state = df_state.rename(columns={'Unnamed: 0': 'age_bin'})
df_state = df_state.reset_index(drop=True)

cols = {
    'female_2019': 1, 'male_2019': 2,
    'female_2020': 4, 'male_2020': 5,
    'female_2021': 7, 'male_2021': 8,
    'female_2022':10, 'male_2022':11,
    'female_2023':13, 'male_2023':14,
}

df_age_sex = pd.DataFrame()
df_age_sex['age_bin'] = df_state['age_bin']

# Assign and clean each desired column
for newcol, idx in cols.items():
    df_age_sex[newcol] = (
        df_state.iloc[:, idx]
    )

# Remove any "Total" or extra summary rows
df_age_sex = df_age_sex[~df_age_sex['age_bin'].str.contains("Total", na=False)].copy()

for col in df_age_sex.columns:
    if col != 'age_bin':  # don't convert bin labels!
        df_age_sex[col] = pd.to_numeric(df_age_sex[col])


print(df_age_sex.info(),df_age_sex)

<class 'pandas.core.frame.DataFrame'>
Index: 13 entries, 0 to 12
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age_bin      13 non-null     object
 1   female_2019  13 non-null     int64 
 2   male_2019    13 non-null     int64 
 3   female_2020  13 non-null     int64 
 4   male_2020    13 non-null     int64 
 5   female_2021  13 non-null     int64 
 6   male_2021    13 non-null     int64 
 7   female_2022  13 non-null     int64 
 8   male_2022    13 non-null     int64 
 9   female_2023  13 non-null     int64 
 10  male_2023    13 non-null     int64 
dtypes: int64(10), object(1)
memory usage: 1.2+ KB
None    age_bin  female_2019  male_2019  female_2020  male_2020  female_2021  \
0       <1       105837     110740       100144     105001       103515   
1      1-4       452146     473774       461835     483003       459749   
2      5-9       565917     590432       574982     600993       583720   
3    10-14       590585     614014       595752     619774       601845   
4    15-19       588355     616780       592774     619696       594807   
5    20-24       620093     649481       621957     648074       629862   
6    25-34      1370805    1417463      1388821    1437525      1403076   
7    35-44      1294285    1273377      1321951    1302602      1349385   
8    45-54      1398322    1345694      1393406    1339768      1390803   
9    55-64      1500802    1348036      1533563    1384124      1561551   
10   65-74      1286140    1103480      1334870    1141115      1386162   
11   75-84       755539     627404       795562     659897       830232   
12     85+       342951     226101       348827     234750       357797   

    male_2021  female_2022  male_2022  female_2023  male_2023  
0      108035       108250     112813       106527     111631  
1      478846       446345     465464       446903     465106  
2      610256       592247     619869       591461     619054  
3      624646       624767     654269       613277     643749  
4      623517       617216     648019       632772     663571  
5      651645       633981     651718       678944     699869  
6     1454968      1401101    1438657      1405448    1460442  
7     1335878      1388739    1400465      1407017    1437365  
8     1337748      1398209    1374181      1396753    1385056  
9     1415552      1578259    1473402      1569641    1477506  
10    1180546      1418697    1231516      1418872    1227549  
11     687257       818514     684502       884067     731385  
12     244189       324276     223702       371580     240038

# Code Block 15
years = range(2019, 2024)
merged_dfs = {}

for year in years:
    # Get FDOC population by sex/age for the year
    df_fdoc = dfs[year].copy()
    # Get state population columns for the year
    mcol = f'male_{year}'
    fcol = f'female_{year}'
    # Merge on 'age_bin'
    merged = df_fdoc.merge(
        df_age_sex[['age_bin', fcol, mcol]], on='age_bin', how='left'
    )
    # Rename columns
    merged = merged.rename(columns={
        'Male': f'fdoc_male_{year}',
        'Female': f'fdoc_female_{year}',
        fcol: f'flpop_female_{year}',
        mcol: f'flpop_male_{year}'
    })
    # Put columns in standard order
    merged = merged[['age_bin', f'fdoc_male_{year}', f'fdoc_female_{year}', f'flpop_male_{year}', f'flpop_female_{year}']]
    merged_dfs[year] = merged

    # Preview result
    print(f"\nMerged distribution for {year}:")
    print(merged)

Merged distribution for 2019:
  age_bin  fdoc_male_2019  fdoc_female_2019  flpop_male_2019  \
0   10-14               2                 0           614014   
1   15-19             794                32           616780   
2   20-24            6733               420           649481   
3   25-34           26604              2284          1417463   
4   35-44           23963              2049          1273377   
5   45-54           17252              1224          1345694   
6   55-64           10972               564          1348036   
7   65-74            3254               123          1103480   
8   75-84             618                18           627404   
9     85+              57                 0           226101   

   flpop_female_2019  
0             590585  
1             588355  
2             620093  
3            1370805  
4            1294285  
5            1398322  
6            1500802  
7            1286140  
8             755539  
9             342951  

Merged distribution for 2020:
  age_bin  fdoc_male_2020  fdoc_female_2020  flpop_male_2020  \
0   15-19             561                27           619696   
1   20-24            5428               341           648074   
2   25-34           23421              1872          1437525   
3   35-44           22493              1820          1302602   
4   45-54           16071              1115          1339768   
5   55-64           10885               555          1384124   
6   65-74            3393               127          1141115   
7   75-84             657                20           659897   
8     85+              59                 1           234750   

   flpop_female_2020  
0             592774  
1             621957  
2            1388821  
3            1321951  
4            1393406  
5            1533563  
6            1334870  
7             795562  
8             348827  

Merged distribution for 2021:
  age_bin  fdoc_male_2021  fdoc_female_2021  flpop_male_2021  \
0   15-19             427                18           623517   
1   20-24            4552               280           651645   
2   25-34           20518              1566          1454968   
3   35-44           21212              1622          1335878   
4   45-54           14828               934          1337748   
5   55-64           10561               518          1415552   
6   65-74            3489               122          1180546   
7   75-84             682                17           687257   
8     85+              53                 2           244189   

   flpop_female_2021  
0             594807  
1             629862  
2            1403076  
3            1349385  
4            1390803  
5            1561551  
6            1386162  
7             830232  
8             357797  

Merged distribution for 2022:
  age_bin  fdoc_male_2022  fdoc_female_2022  flpop_male_2022  \
0   10-14               1                 0           654269   
1   15-19             424                23           648019   
2   20-24            4334               282           651718   
3   25-34           20520              1587          1438657   
4   35-44           21973              1741          1400465   
5   45-54           15132               973          1374181   
6   55-64           10866               531          1473402   
7   65-74            3791               127          1231516   
8   75-84             769                21           684502   
9     85+              58                 1           223702   

   flpop_female_2022  
0             624767  
1             617216  
2             633981  
3            1401101  
4            1388739  
5            1398209  
6            1578259  
7            1418697  
8             818514  
9             324276  

Merged distribution for 2023:
  age_bin  fdoc_male_2023  fdoc_female_2023  flpop_male_2023  \
0   15-19             480                18           663571   
1   20-24            4393               298           699869   
2   25-34           20467              1647          1460442   
3   35-44           23119              2031          1437365   
4   45-54           15734              1110          1385056   
5   55-64           11140               564          1477506   
6   65-74            4135               138          1227549   
7   75-84             878                27           731385   
8     85+              70                 2           240038   

   flpop_female_2023  
0             632772  
1             678944  
2            1405448  
3            1407017  
4            1396753  
5            1569641  
6            1418872  
7             884067  
8             371580

# Code Block 16
df = merged_dfs[2023].copy()
for col in ['fdoc_male_2023', 'fdoc_female_2023', 'flpop_male_2023', 'flpop_female_2023']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df['fdoc_female_2023_neg'] = -df['fdoc_female_2023']
df['flpop_female_2023_neg'] = -df['flpop_female_2023']
df['age_bin'] = pd.Categorical(df['age_bin'], categories=df['age_bin'].tolist(), ordered=True)

# Global config for dark mode and bar gaps
alt.themes.enable('none')
custom_config = {
    "config": {
        "background": 'rgba(0,0,0,0)',
        "axis": {
            "domainColor": "white",
            "gridColor": "white",
            "labelColor": "white",
            "tickColor": "white",
            "titleColor": "white"
        },
        "title": {"color": "white"},
        "legend": {"labelColor": "white", "titleColor": "white"},
        "view": {"stroke": "white"},
        "bar": {"binSpacing": 0}
    }
}
alt.themes.register('johnwise_black', lambda: custom_config)
alt.themes.enable('johnwise_black')

#### --- FDOC Chart ---
base_fdoc = alt.Chart(df).encode(
    y=alt.Y('age_bin', sort=None, title='Age Group', axis=alt.Axis(labelAlign='right', labelPadding=10, labelColor='white', titleColor='white')),
)

male_fdoc = base_fdoc.mark_bar(color="#348ABD", size = 35).encode(
    x=alt.X('fdoc_male_2023:Q', title='FDOC Male'),
    tooltip=['age_bin', alt.Tooltip('fdoc_male_2023')]
)
female_fdoc = base_fdoc.mark_bar(color="#E24A33", size = 35).encode(
    x=alt.X('fdoc_female_2023_neg:Q', title='FDOC Female',
            axis=alt.Axis(labelExpr="abs(datum.value)", titleColor='white', labelColor='white')),
    tooltip=['age_bin', alt.Tooltip('fdoc_female_2023')]
)

pyramid_fdoc = (female_fdoc + male_fdoc).properties(
    title=alt.TitleParams('FDOC 2023 Prison Population Pyramid', color='white'),
    width=260, height=320
)

#### --- Statewide Chart ---
base_state = alt.Chart(df).encode(
    y=alt.Y('age_bin', sort=None, title='Age Group', axis=alt.Axis(labelAlign='right', labelPadding=10, labelColor='white', titleColor='white')),
)
male_state = base_state.mark_bar(color="#348ABD", size = 35).encode(
    x=alt.X('flpop_male_2023:Q', title='FL Male'),
    tooltip=['age_bin', alt.Tooltip('flpop_male_2023')]
)
female_state = base_state.mark_bar(color="#E24A33", size = 35).encode(
    x=alt.X('flpop_female_2023_neg:Q', title='FL Female',
            axis=alt.Axis(labelExpr="abs(datum.value)", titleColor='white', labelColor='white')),
    tooltip=['age_bin', alt.Tooltip('flpop_female_2023')]
)
pyramid_state = (female_state + male_state).properties(
    title=alt.TitleParams('Florida 2023 Population Pyramid', color='white'),
    width=260, height=320
)


chart = alt.vconcat(
    pyramid_fdoc,
    pyramid_state
)
chart.save("assets/double_pop_pyramid.json")
chart

# Code Block 17
df19 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2019-2020.csv")
df20 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2020-2021.csv")
df21 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2021-2022.csv")
df22 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2022-2023.csv")
df23 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2023-2024.csv")
df_all_FY = pd.concat([df19, df20, df21, df22, df23], ignore_index = True)
print(df_all_FY.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Name                              2148 non-null   object
 1   DC Number                         2148 non-null   object
 2   Date of Death                     2260 non-null   object
 3   Institution Name                  2260 non-null   object
 4   Manner of Death Determined by ME  2260 non-null   object
 5   Investigative Status              2260 non-null   object
dtypes: object(6)
memory usage: 106.1+ KB
None

# Code Block 18
df_deaths_19_23 = df_all_FY[['Name', 'DC Number', 'Date of Death']].copy()
# Rename 'DC Number' to 'DCNumber'
df_deaths_19_23.rename(columns={'DC Number': 'DCNumber'}, inplace=True)
df_deaths_19_23['Date of Death'] = pd.to_datetime(df_deaths_19_23['Date of Death'], format = '%m/%d/%Y')
print(df_deaths_19_23.head(), df_deaths_19_23.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           2148 non-null   object        
 1   DCNumber       2148 non-null   object        
 2   Date of Death  2260 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 53.1+ KB
              Name DCNumber Date of Death
0              NaN      NaN    2019-07-01
1  HORTON, MICHAEL   073661    2019-07-02
2              NaN      NaN    2019-07-05
3  FERGUSON, LEROY   716169    2019-07-06
4  BUSSER, WILLIAM   N26218    2019-07-06 None

# Code Block 19
df_deaths_births_19_23 = pd.merge(df_deaths_19_23, df_birth, on='DCNumber', how='left')
print(df_deaths_births_19_23.info(), df_deaths_births_19_23.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           2148 non-null   object        
 1   DCNumber       2148 non-null   object        
 2   Date of Death  2260 non-null   datetime64[ns]
 3   BirthDate      2145 non-null   datetime64[ns]
 4   Sex            2145 non-null   object        
dtypes: datetime64[ns](2), object(3)
memory usage: 88.4+ KB
None               Name DCNumber Date of Death  BirthDate  Sex
0              NaN      NaN    2019-07-01        NaT  NaN
1  HORTON, MICHAEL   073661    2019-07-02 1952-07-19    M
2              NaN      NaN    2019-07-05        NaT  NaN
3  FERGUSON, LEROY   716169    2019-07-06 1960-11-29    M
4  BUSSER, WILLIAM   N26218    2019-07-06 1937-07-17    M

# Code Block 20
missing_birthdate_rows = df_deaths_births_19_23[
    (df_deaths_births_19_23['DCNumber'].notna()) &
    (df_deaths_births_19_23['BirthDate'].isna())
]
print(missing_birthdate_rows)
missing_birthdate_rows2 = df_deaths_births_19_23[
    (df_deaths_births_19_23['BirthDate'].notna()) &
    (df_deaths_births_19_23['DCNumber'].isna())
]
print(missing_birthdate_rows2)

                  Name DCNumber Date of Death BirthDate  Sex
799    BAILEY, ROMULUS   168749    2021-01-05       NaT  NaN
1551     JORDAN, JAMES   555250    2022-11-02       NaT  NaN
2032  CHEATHAM, GERALD   083980    2023-11-28       NaT  NaN
Empty DataFrame
Columns: [Name, DCNumber, Date of Death, BirthDate, Sex]
Index: []

# Code Block 21
adj = 1- df_deaths_births_19_23[df_deaths_births_19_23['BirthDate'].isna()].shape[0]/df_all_FY.shape[0]
print(adj)

0.9491150442477876

# Code Block 22
df_deaths_births_19_23.dropna(subset =['BirthDate'], inplace = True)
print(df_deaths_births_19_23.info(), df_deaths_births_19_23.head())

<class 'pandas.core.frame.DataFrame'>
Index: 2145 entries, 1 to 2258
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           2145 non-null   object        
 1   DCNumber       2145 non-null   object        
 2   Date of Death  2145 non-null   datetime64[ns]
 3   BirthDate      2145 non-null   datetime64[ns]
 4   Sex            2145 non-null   object        
dtypes: datetime64[ns](2), object(3)
memory usage: 100.5+ KB
None                Name DCNumber Date of Death  BirthDate Sex
1   HORTON, MICHAEL   073661    2019-07-02 1952-07-19   M
3   FERGUSON, LEROY   716169    2019-07-06 1960-11-29   M
4   BUSSER, WILLIAM   N26218    2019-07-06 1937-07-17   M
5      GIGLIO, DUKE   053611    2019-07-06 1954-12-16   M
6  COPPERTINO, JOHN   341241    2019-07-07 1952-05-21   M

# Code Block 23
df_deaths_births_19_23['age_at_death'] = df_deaths_births_19_23.apply(
    lambda r: age_on(r['BirthDate'], r['Date of Death']),
    axis=1
).astype('Int64')
print(df_deaths_births_19_23.head())

               Name DCNumber Date of Death  BirthDate Sex  age_at_death
1   HORTON, MICHAEL   073661    2019-07-02 1952-07-19   M            66
3   FERGUSON, LEROY   716169    2019-07-06 1960-11-29   M            58
4   BUSSER, WILLIAM   N26218    2019-07-06 1937-07-17   M            81
5      GIGLIO, DUKE   053611    2019-07-06 1954-12-16   M            64
6  COPPERTINO, JOHN   341241    2019-07-07 1952-05-21   M            67

# Code Block 24
deaths_by_year = {}

for year in range(2019, 2024):
    mask = df_deaths_births_19_23['Date of Death'].dt.year == year
    df_year = df_deaths_births_19_23[mask].copy()
    df_year['age_bin'] = pd.cut(
        df_year['age_at_death'], 
        bins=age_bins, 
        labels=age_labels, 
        right=False, 
        include_lowest=True
    )
    grouped = (
        df_year.groupby(['age_bin', 'Sex'], observed=True)
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    grouped.columns.name = None
    grouped = grouped.rename(columns={'M': f'male_deaths_{year}', 'F': f'female_deaths_{year}'})

    deaths_by_year[year] = grouped

    print(f"\nDeaths by age and sex, {year}:")
    print(grouped)

Deaths by age and sex, 2019:
  age_bin  female_deaths_2019  male_deaths_2019
0   25-34                   1                11
1   35-44                   1                14
2   45-54                   1                30
3   55-64                   1                46
4   65-74                   0                52
5   75-84                   1                27
6     85+                   0                 3

Deaths by age and sex, 2020:
  age_bin  female_deaths_2020  male_deaths_2020
0   20-24                   0                 2
1   25-34                   3                18
2   35-44                   4                47
3   45-54                   1                63
4   55-64                   6               146
5   65-74                   5               171
6   75-84                   3                86
7     85+                   0                17

Deaths by age and sex, 2021:
  age_bin  female_deaths_2021  male_deaths_2021
0   25-34                   1                18
1   35-44                   2                34
2   45-54                   4                48
3   55-64                   1               131
4   65-74                   2               108
5   75-84                   1                59
6     85+                   0                11

Deaths by age and sex, 2022:
  age_bin  female_deaths_2022  male_deaths_2022
0   25-34                   1                20
1   35-44                   3                31
2   45-54                   1                53
3   55-64                   0               103
4   65-74                   2                95
5   75-84                   0                52
6     85+                   0                 8

Deaths by age and sex, 2023:
  age_bin  female_deaths_2023  male_deaths_2023
0   20-24                   1                 4
1   25-34                   1                26
2   35-44                   1                35
3   45-54                   1                54
4   55-64                   4               108
5   65-74                   4               101
6   75-84                   3                65
7     85+                   0                19

#Code Block 25
final_merged = {}

for year in range(2019, 2024):
    popdf = merged_dfs[year]
    deathsdf = deaths_by_year[year]
    merged = popdf.merge(deathsdf, on='age_bin', how='left')
    for sexcol in [f'male_deaths_{year}', f'female_deaths_{year}']:
        if sexcol in merged.columns:
            merged[sexcol] = merged[sexcol].fillna(0).astype(int)
    final_merged[year] = merged
    print(f"\nMerged table for {year}:")
    print(merged)

Merged table for 2019:
  age_bin  fdoc_male_2019  fdoc_female_2019  flpop_male_2019  \
0   10-14               2                 0           614014   
1   15-19             794                32           616780   
2   20-24            6733               420           649481   
3   25-34           26604              2284          1417463   
4   35-44           23963              2049          1273377   
5   45-54           17252              1224          1345694   
6   55-64           10972               564          1348036   
7   65-74            3254               123          1103480   
8   75-84             618                18           627404   
9     85+              57                 0           226101   

   flpop_female_2019  female_deaths_2019  male_deaths_2019  
0             590585                   0                 0  
1             588355                   0                 0  
2             620093                   0                 0  
3            1370805                   1                11  
4            1294285                   1                14  
5            1398322                   1                30  
6            1500802                   1                46  
7            1286140                   0                52  
8             755539                   1                27  
9             342951                   0                 3  

Merged table for 2020:
  age_bin  fdoc_male_2020  fdoc_female_2020  flpop_male_2020  \
0   15-19             561                27           619696   
1   20-24            5428               341           648074   
2   25-34           23421              1872          1437525   
3   35-44           22493              1820          1302602   
4   45-54           16071              1115          1339768   
5   55-64           10885               555          1384124   
6   65-74            3393               127          1141115   
7   75-84             657                20           659897   
8     85+              59                 1           234750   

   flpop_female_2020  female_deaths_2020  male_deaths_2020  
0             592774                   0                 0  
1             621957                   0                 2  
2            1388821                   3                18  
3            1321951                   4                47  
4            1393406                   1                63  
5            1533563                   6               146  
6            1334870                   5               171  
7             795562                   3                86  
8             348827                   0                17  

Merged table for 2021:
  age_bin  fdoc_male_2021  fdoc_female_2021  flpop_male_2021  \
0   15-19             427                18           623517   
1   20-24            4552               280           651645   
2   25-34           20518              1566          1454968   
3   35-44           21212              1622          1335878   
4   45-54           14828               934          1337748   
5   55-64           10561               518          1415552   
6   65-74            3489               122          1180546   
7   75-84             682                17           687257   
8     85+              53                 2           244189   

   flpop_female_2021  female_deaths_2021  male_deaths_2021  
0             594807                   0                 0  
1             629862                   0                 0  
2            1403076                   1                18  
3            1349385                   2                34  
4            1390803                   4                48  
5            1561551                   1               131  
6            1386162                   2               108  
7             830232                   1                59  
8             357797                   0                11  

Merged table for 2022:
  age_bin  fdoc_male_2022  fdoc_female_2022  flpop_male_2022  \
0   10-14               1                 0           654269   
1   15-19             424                23           648019   
2   20-24            4334               282           651718   
3   25-34           20520              1587          1438657   
4   35-44           21973              1741          1400465   
5   45-54           15132               973          1374181   
6   55-64           10866               531          1473402   
7   65-74            3791               127          1231516   
8   75-84             769                21           684502   
9     85+              58                 1           223702   

   flpop_female_2022  female_deaths_2022  male_deaths_2022  
0             624767                   0                 0  
1             617216                   0                 0  
2             633981                   0                 0  
3            1401101                   1                20  
4            1388739                   3                31  
5            1398209                   1                53  
6            1578259                   0               103  
7            1418697                   2                95  
8             818514                   0                52  
9             324276                   0                 8  

Merged table for 2023:
  age_bin  fdoc_male_2023  fdoc_female_2023  flpop_male_2023  \
0   15-19             480                18           663571   
1   20-24            4393               298           699869   
2   25-34           20467              1647          1460442   
3   35-44           23119              2031          1437365   
4   45-54           15734              1110          1385056   
5   55-64           11140               564          1477506   
6   65-74            4135               138          1227549   
7   75-84             878                27           731385   
8     85+              70                 2           240038   

   flpop_female_2023  female_deaths_2023  male_deaths_2023  
0             632772                   0                 0  
1             678944                   1                 4  
2            1405448                   1                26  
3            1407017                   1                35  
4            1396753                   1                54  
5            1569641                   4               108  
6            1418872                   4               101  
7             884067                   3                65  
8             371580                   0                19

# Code Block 26
for year in range(2019, 2024):
    for sex in ['male', 'female']:
        deaths_col = f"{sex}_deaths_{year}"
        pop_col = f"fdoc_{sex}_{year}"
        flpop_col = f"flpop_{sex}_{year}"
        proj_deaths_col = f"{sex}_deaths_proj"
        final_merged[year][proj_deaths_col] = (
            final_merged[year][deaths_col] / final_merged[year][pop_col] * final_merged[year][flpop_col] * adj
        )
# sanity check
final_merged[2020]

# Code Block 27
years = range(2019, 2024)
results = []

for year in years:
    df = final_merged[year]
    total_deaths_proj = df['male_deaths_proj'].sum() + df['female_deaths_proj'].sum()
    total_population = df[f'flpop_male_{year}'].sum() + df[f'flpop_female_{year}'].sum()
    yearly_rate = round(total_deaths_proj / total_population * 100000, 2)
    results.append({'year': year, 'AADR': yearly_rate})

# Output DataFrame with rates per year
rates_df = pd.DataFrame(results)
print(rates_df)

# Mean across the 5 years
mean_aadr = round(rates_df['AADR'].mean(), 2)
print(f"\nMean AADR (2019–2023): {mean_aadr}")

   year     AADR
0  2019   566.76
1  2020  2277.23
2  2021  1313.25
3  2022   746.67
4  2023  1633.91

Mean AADR (2019–2023): 1307.56

IMG_PATH = "assets/Screenshot 2025-08-17 at 4.23.46 PM.png"

# Optional metadata
FDOH_URL = "https://www.flhealthcharts.gov/FLQUERY_New/Population/Count"
QUERY_TEXT = "Crude Death Rate by Year, All causes"
RETRIEVED = "2025-08-17 at 22.23.46 UTC"

# Display the image (embedded in exported HTML)
display(Image(filename=IMG_PATH, embed=True, width=1000))

# Caption below the image
display(Markdown(
    f"Source: [FDOH data portal]({FDOH_URL})  \n"
    f"Query: {QUERY_TEXT}  \n"
    f"Retrieved: {RETRIEVED}"
))

# Code Block 29
df_fl_crude = pd.read_csv('fdoc_deaths/csvs/FDOH_Crude_death_rate.csv', header =2)
#print(df_fl_crude.info(), df_fl_crude.head())

# Reshape df_fl_crude from wide to long (pivot)
df_fl_crude_long = df_fl_crude.drop(columns=['Unnamed: 10'], errors='ignore').melt(
    var_name='year', value_name='FL_Crude_Death_Rate'
)

# Filter to years of interest and ensure type
years = [2019, 2020, 2021, 2022, 2023]
df_fl_crude_long = df_fl_crude_long[df_fl_crude_long['year'].astype(int).isin(years)].copy()
df_fl_crude_long['year'] = df_fl_crude_long['year'].astype(int)

# Merge with rates dataframe
combined = rates_df.merge(df_fl_crude_long, on='year', how='left')

print(combined)

   year     AADR  FL_Crude_Death_Rate
0  2019   566.76                973.2
1  2020  2277.23               1106.2
2  2021  1313.25               1187.2
3  2022   746.67               1070.1
4  2023  1633.91               1009.7

# Code Block 30
df_bar = combined.copy()
df_bar['Series'] = 'FDOC AADR'
df_bar = df_bar.rename(columns={'AADR': 'value'})

df_line = combined.copy()
df_line['Series'] = 'FL Crude Death Rate'
df_line = df_line.rename(columns={'FL_Crude_Death_Rate': 'value'})

long_df = pd.concat([df_bar[['year','Series','value']], df_line[['year','Series','value']]], ignore_index=True)

color_scale = alt.Scale(
    domain=['FDOC AADR', 'FL Crude Death Rate'],
    range=['#4bb4e6', '#ffa600']
)

base = alt.Chart(long_df).encode(
    x=alt.X('year:O', title='Year')
)

bars = base.transform_filter(
    alt.datum.Series == 'FDOC AADR'
).mark_bar(
    size=40,
    opacity=0.85,
    cornerRadiusTopLeft=8,
    cornerRadiusTopRight=8
).encode(
    y=alt.Y('value:Q', title='Death Rate (per 100,000)'),
    color=alt.Color('Series:N', scale=color_scale, legend=alt.Legend(title=None)),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FDOC AADR')
    ]
)

line = base.transform_filter(
    alt.datum.Series == 'FL Crude Death Rate'
).mark_line(
    interpolate='monotone',
    strokeWidth=3
).encode(
    y='value:Q',
    color=alt.Color('Series:N', scale=color_scale, legend=alt.Legend(title=None)),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FL Crude Death Rate')
    ]
)

points = base.transform_filter(
    alt.datum.Series == 'FL Crude Death Rate'
).mark_circle(size=120).encode(
    y='value:Q',
    color=alt.Color('Series:N', scale=color_scale, legend=None),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FL Crude Death Rate')
    ]
)

mean_aadr = round(combined['AADR'].mean(), 2)
mean_rule = alt.Chart(pd.DataFrame({'mean_aadr': [mean_aadr]})).mark_rule(
    color='red', strokeDash=[8,4], size=3
).encode(
    y='mean_aadr:Q',
    tooltip=[alt.Tooltip('mean_aadr:Q', format='.2f', title='Mean FDOC AADR')]
)

final_plot = (bars + line + points).properties(
    width=500,
    height=320,
    title=alt.TitleParams(
        'Florida Statewide Death Rate vs. FDOC Age Adjusted Death Rate (2019–2023)', 
        color='white'
    )
)

final_plot.save("assets/FDOC_AADR_vs_FL_Crude.json")
final_plot

Code Block 1 - Retrieve the table of inmate mortality from the FDOC Website¶

Code Block 2 - Retrieve Statewide Homicide Data from the FDOH¶

Code Block 3 - Utilize and transform the FDOC Tabular data¶

External Data Source¶

Code Block 4 - Generate Homicde Rate Visualization¶

Code Block 5 - Bringing in the raw OBIS data¶

Using the Incarceration Data¶

Code Block 6-8 - Determine who was incarcerated when¶

Code Block 7 - data wrangling¶

Code Block 8 - Compiling incarceration intervals¶

Code Block 9 - Bringing in Age/Sex Data¶

Code Block 10 - Data Wrangling Age/Sex Data¶

Null birthdays verified null in https://pubapps.fdc.myflorida.com/OffenderSearch/InmateInfoMenu.aspx ¶

Code Block 11 - Associate the age/sex data with the intervals of incarceration for each person¶

Code Block 12-13 - Get counts per-year per-population group¶

Code Block 12¶

Code Block 13¶

Code Block 14 - Pull in FDOH data on state population¶

Code Block 15 - Combine Statewide population counts with FODC incarcerated counts per year¶

Code Block 16 - Generate Population Pyramids Visualiztion for 2023¶

Code Block 17 - Bring in individual death data¶

Code Block 18 - Data Wrangling¶

Code Block 19 - Join individual death data to previous datatable containing all birthdays/sex on record¶

Code Block 20 - Investigate completeness¶

Code Bock 21 -- Compute the adj adjustment metric¶

Code Block 22 - Drop death entries for which no age data can be determined¶

Code Block 23 Compute age at death for all death entries¶

Code Block 24 - Compute male and female deaths per year by age group¶

Code Block 25 - Merge the previously computed death data with the previous population data¶

Compute Age Adjusted Death Rates¶

Code Block 26¶

Code Block 27 - Final Computation of FDOC AADR¶

Code Block 28 - Statewide Crude Death Rate¶

Code Block 29¶

Code Block 30 - Building the Visualization: Florida Statewide Death Rate vs. FDOC Age Adjusted Death Rate (2019–2023)¶

	age_bin	fdoc_male_2020	fdoc_female_2020	flpop_male_2020	flpop_female_2020	female_deaths_2020	male_deaths_2020	male_deaths_proj	female_deaths_proj
0	15-19	561	27	619696	592774	0	0	0.000000	0.000000
1	20-24	5428	341	648074	621957	0	2	226.638461	0.000000
2	25-34	23421	1872	1437525	1388821	3	18	1048.579432	2112.421322
3	35-44	22493	1820	1302602	1321951	4	47	2583.337051	2757.546334
4	45-54	16071	1115	1339768	1393406	1	63	4984.781269	1186.100984
5	55-64	10885	555	1384124	1533563	6	146	17620.502074	15735.434752
6	65-74	3393	127	1141115	1334870	5	171	54583.392203	49879.732249
7	75-84	657	20	659897	795562	3	86	81983.809209	113261.979425
8	85+	59	1	234750	348827	0	17	64197.980726	0.000000

Code Block 1 - Retrieve the table of inmate mortality from the FDOC Website¶

Code Block 2 - Retrieve Statewide Homicide Data from the FDOH¶

Code Block 3 - Utilize and transform the FDOC Tabular data¶

External Data Source¶

Code Block 4 - Generate Homicde Rate Visualization¶

Code Block 5 - Bringing in the raw OBIS data¶

Using the Incarceration Data¶

Code Block 6-8 - Determine who was incarcerated when¶

Code Block 7 - data wrangling¶

Code Block 8 - Compiling incarceration intervals¶

Code Block 9 - Bringing in Age/Sex Data¶

Code Block 10 - Data Wrangling Age/Sex Data¶

Null birthdays verified null in https://pubapps.fdc.myflorida.com/OffenderSearch/InmateInfoMenu.aspx¶

Code Block 11 - Associate the age/sex data with the intervals of incarceration for each person¶

Code Block 12-13 - Get counts per-year per-population group¶

Code Block 12¶

Code Block 13¶

Code Block 14 - Pull in FDOH data on state population¶

Code Block 15 - Combine Statewide population counts with FODC incarcerated counts per year¶

Code Block 16 - Generate Population Pyramids Visualiztion for 2023¶

Code Block 17 - Bring in individual death data¶

Code Block 18 - Data Wrangling¶

Code Block 19 - Join individual death data to previous datatable containing all birthdays/sex on record¶

Code Block 20 - Investigate completeness¶

Code Bock 21 -- Compute the adj adjustment metric¶

Code Block 22 - Drop death entries for which no age data can be determined¶

Code Block 23 Compute age at death for all death entries¶

Code Block 24 - Compute male and female deaths per year by age group¶

Code Block 25 - Merge the previously computed death data with the previous population data¶

Compute Age Adjusted Death Rates¶

Code Block 26¶

Code Block 27 - Final Computation of FDOC AADR¶

Code Block 28 - Statewide Crude Death Rate¶

Code Block 29¶

Code Block 30 - Building the Visualization: Florida Statewide Death Rate vs. FDOC Age Adjusted Death Rate (2019–2023)¶

Null birthdays verified null in https://pubapps.fdc.myflorida.com/OffenderSearch/InmateInfoMenu.aspx ¶