If you're viewing this on your phone, I'd recommend enabling desktop mode. In Google Chrome, tap the three dots at the top right of your screen and check the box next to Desktop Site. You can then zoom in and out of any section with your fingers. My apologies for the hassle and thanks for reading!
Here, I go through an exploratory analysis of Baltimore city employee salaries. I obtained all data through the City of Baltimore's Open Baltimore Data platform (OBD). I wanted to describe and illustrate a process for taking publicly available data and using it to conduct basic data analysis and visualization. All tools used here are free and open source (Google Cloud is a slight exception, but the free tier will get you pretty far).
I created a small Python function that batches calls to the OBD REST API. I connected the GitHub repo where I store the Python code to a gcloud source repo. Each time I push a change to the master branch, GitHub notifies gcloud, and Cloud Build uses the updated cloud source repo to deploy a new version to Cloud Functions. Since OBD appears to only update the data once per year or so, I run the function manually. However, Cloud Scheduler could run my function at any interval I want using cron jobs.
Once the function finishes querying OBD, it stores the data in a csv in Cloud Storage. From there, I connect the csv to a BigQuery table. I considered updating the function to write directly to the BigQuery table, but I want to see how regularly OBD updates this data set going forward and if they change the schema. They've changed the schema for this data set at least once that I've seen. I'll add some other details about this data set and a flow chart in the technical notes below.
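For anyone curious, here's a minimal sketch of what that function could look like. It assumes an ArcGIS-style REST endpoint with resultOffset/resultRecordCount paging, and the endpoint URL, bucket name, and file name are placeholders rather than the real values:

# hypothetical sketch of the batching function; endpoint URL, bucket, and
# file names are placeholders, and the paging params assume an ArcGIS-style API
import csv
import io
import requests
from google.cloud import storage

ENDPOINT = "https://example.com/arcgis/rest/services/salaries/query"  # placeholder
BATCH_SIZE = 1000

def fetch_all_records():
    # page through the API until a batch comes back empty
    records, offset = [], 0
    while True:
        resp = requests.get(ENDPOINT, params={
            "where": "1=1",
            "outFields": "*",
            "f": "json",
            "resultOffset": offset,
            "resultRecordCount": BATCH_SIZE,
        })
        resp.raise_for_status()
        features = resp.json().get("features", [])
        if not features:
            break
        records.extend(f["attributes"] for f in features)
        offset += BATCH_SIZE
    return records

def store_salaries(request):
    # Cloud Functions HTTP entry point: fetch everything, write one csv to GCS
    records = fetch_all_records()
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=records[0].keys())
    writer.writeheader()
    writer.writerows(records)
    bucket = storage.Client().bucket("my-salary-bucket")  # placeholder bucket
    bucket.blob("city_employee_salaries.csv").upload_from_string(
        buffer.getvalue(), content_type="text/csv")
    return f"stored {len(records)} records"

Cloud Scheduler could hit that same HTTP entry point on a cron schedule if OBD ever starts updating more often.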
Some technical notes on the data set before diving in:

- I combined firstName, lastName, and middleInitial fields with AgencyID to create an employeeSlug that I use to track workers across organizations. With just the Name and AgencyID fields to work with in this release, tracking individual workers is difficult, though not impossible. It'll definitely be the work of a future analysis. This initial analysis focuses on agency-level salary measures.
- I excluded records missing the HireDate field. I wanted to keep the initial analysis as robust as possible, so I elected to clean the data in this way. I also excluded records that were missing data in the annualSalary field, since that field was essential to my measures for this initial analysis.
- I consolidated AgencyName values because the data set duplicated some agency names, made distinctions inside of an agency I didn't want to keep, etc. For example, OBD broke out "Rec & Parks" into "Rec & Parks - Parks", "Rec & Parks - part-time", "Rec & Parks - Admin", etc.

Gotta get the data in! As I noted in the beginning, I stored this data in a BigQuery table. So the first step is to query the table and bring in the salary and employee name data I want. After I created a Google Cloud account and project, I installed the Google Cloud SDK for Python. Lots of trial and error, but overall a pretty straightforward process. Unfortunately, I don't have a very comprehensive set of links to show y'all how to get started on Google Cloud if you wanted to recreate this setup, but I'll list the basic steps below:
- Create a Google Cloud account and project.
- Install the Google Cloud SDK for Python.
- Set up authentication so you only need a project_id and nothing else to connect to my gcloud instances. Highly recommend this and it was very easy.

Once all that is done, you can run some version of the code below and bring your data in for analysis.
# import necessary libraries
from google.cloud import bigquery
import pandas as pd
import numpy as np
import datetime as dt
from dateutil import relativedelta as rd
import plotly.express as px
# build client to connect to open-baltimore project
project_id = "open-baltimore-data"
client = bigquery.Client(project=project_id)
# import employee name info
employee_salaries_query = client.query(
"""
SELECT
info.employeeSlug,
main.objectId,
TRIM(REGEXP_REPLACE(main.agencyName, r'\(.*?\)', '')) as agencyName,
main.agencyId,
main.annualSalary,
main.fiscalYear,
main.hireDate
FROM city_employee_salaries.main as main
LEFT JOIN city_employee_salaries.employee_info as info
ON main.ObjectId = info.ObjectId
"""
)
employee_salaries = employee_salaries_query.result().to_dataframe()
Here, I created my own dictionary for translating existing city agency names to my own. I also filtered out records with no salary data. Then I used map to populate a cleanAgencyName column with the new names. map loops over each record in the existing agencyId field, checks my dictionary for a match, then returns the matches in the new column. It's a super fast function and the syntax is very simple. Lastly, I converted the hireDate field to datetime format so I can calculate tenure.
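Before the full cleaning code, a quick toy illustration of that behavior: map returns NaN for any ID missing from the dictionary, which makes unmapped agencies easy to spot. The IDs and dictionary here are made up for the example.

# toy example: Series.map returns NaN for keys missing from the dict
toy_ids = pd.Series(["A01", "A04", "ZZZ"])
toy_ids.map({"A01": "Mayors Office", "A04": "Rec & Parks"})
# 0    Mayors Office
# 1      Rec & Parks
# 2              NaN
# dtype: object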
# clean fields for analysis
agency_dict = {
"A01": "Mayors Office",
"A02": "City Council",
"A02": "Mayors OED",
"A04": "Rec & Parks",
"A05": "MONSE",
"A06": "Housing & Community Dev",
"A08": "M-R Human Services",
"A09": "Liquor License Board",
"A10": "Mayors Office of Children & Families",
"A11": "Office of the Inspector General",
"A12": "Finance",
"A14": "Finance",
"A15": "Comptrollers Office",
"A16": "Comptrollers Office",
"A17": "Finance",
"A18": "Finance",
"A19": "City Planning",
"A23": "Finance",
"A24": "Comptroller - Audits",
"A26": "M-R Labor Commissioner",
"A28": "Wage Commissioner",
"A29": "States Attorneys Office",
"A30": "Law Department",
"A31": "Circuit Court",
"A32": "Finance",
"A33": "Legislative Reference",
"A35": "Elections",
"A37": "Orphans Court",
"A38": "Sheriffs Office",
"A39": "311",
"A40": "BCIT",
"A41": "DPW - Admin",
"A44": "M-R Cable & Comms",
"A46": "Environmental Control Board",
"A49": "Transportation",
"A50": "DPW - Waste & Wastewater",
"A51": "Office of Equity & Civil Rights",
"A52": "Employee Retirement System",
"A53": "Finance",
"A54": "Retirement - Fire & Police",
"A57": "City Council Services",
"A64": "Fire Department",
"A65": "Health Department",
"A67": "Rec & Parks",
"A68": "Rec & Parks",
"A70": "DPW - Solid Waste",
"A73": "Municipal Zoning & Appeals",
"A75": "Enoch Pratt Free Library",
"A83": "Human Resources",
"A84": "Transportation",
"A85": "General Services",
"A86": "War Memorial Commission",
"A88": "Comptroller - Comms",
"A90": "Transportation",
"A91": "Convention Center",
"A99": "Police Department",
"A9": "Police Department",
"B49": "Transportation",
"B68": "Rec & Parks",
"B70": "DPW - Solid Waste",
"BPD": "Police Department",
"C90": "Transportation - Crossing Guards",
"P04": "Rec & Parks",
"P65": "Health Department",
"P83": "HR Test Monitor",
"R01": "R01",
"U01": "U01",
"SCS": "Special City Services",
"W02": "Youth Summer Works",
"W03": "Youth Cust",
"W07": "Youth Temp Adult",
"W08": "TANF Cust"
}
employee_salaries_clean = employee_salaries.copy(deep=True)
# keep records with a non-null, non-zero annualSalary (the comparison needs
# its own parentheses, since & binds tighter than !=)
employee_salaries_clean = employee_salaries_clean[employee_salaries_clean["annualSalary"].notnull()
                                                  & (employee_salaries_clean["annualSalary"] != 0)]
employee_salaries_clean['cleanAgencyName'] = employee_salaries_clean['agencyId'].map(agency_dict)
employee_salaries_clean = employee_salaries_clean[employee_salaries_clean["hireDate"].notnull()]
employee_salaries_clean['hireDate'] = pd.to_datetime(employee_salaries_clean["hireDate"], unit="ms")
# may need to implement check and ensure tenure across unique employees is the same
employee_salaries_clean['tenure'] = \
employee_salaries_clean['hireDate'].map(lambda hire_date: rd.relativedelta(dt.datetime.now(), hire_date).years)
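That last comment flags a check I haven't written yet. A minimal sketch of what it could look like, assuming the employeeSlug join populated for most records:

# possible sanity check: each employeeSlug should carry exactly one distinct
# hireDate, and therefore one tenure value
hire_date_counts = employee_salaries_clean.groupby("employeeSlug")["hireDate"].nunique()
conflicting = hire_date_counts[hire_date_counts > 1]
print(f"{len(conflicting)} employee slugs have more than one hire date")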
Once I noticed I had actual employee names, I really wanted to try to track employees across years. I tried to create a slug, but since there's only one Name field, employees with multiple records can have one record with a middle initial and one without. This would look something like Owens,Dyan-S-D01 and Owens,Dyan-NA-D01 labeled as different employees when they're the same person. I don't imagine this is a huge proportion of the records, and there are a couple of cleaning methods I could employ to make the employeeSlug more reliable (one sketched below). Once again, that'll probably be in a future analysis.
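For example, one of those cleaning methods could be as simple as dropping the middle-initial token before matching. The normalize_slug helper below is hypothetical, and it assumes every slug has the Name-Initial-AgencyID shape shown above:

# hypothetical cleanup: collapse "Owens,Dyan-S-D01" and "Owens,Dyan-NA-D01"
# into one "Owens,Dyan-D01" slug by dropping the middle-initial token;
# assumes all three tokens are always present
def normalize_slug(slug: str) -> str:
    name, _initial, agency_id = slug.rsplit("-", 2)
    return f"{name}-{agency_id}"

normalize_slug("Owens,Dyan-S-D01")   # "Owens,Dyan-D01"
normalize_slug("Owens,Dyan-NA-D01")  # "Owens,Dyan-D01"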
Regardless, I wanted to start by counting salary records by agency. This got me to the 500 record cutoff I used to keep my measures robust. Then, I ran measures by employee and agency. I calculated 7 measures to start:

- medSalary
- highestSalary
- lowestSalary
- growth
- salaryRecords
- tenure: this comes from the hireDate field I formatted earlier in the analysis. Each hireDate record for each individual employee should be the same, so I can use the first function to simply take the tenure field I calculated for each record. It's somewhat of a dynamic field since I get it by subtracting the number of years between the current date and the hire date provided. It rests on a lot of assumptions and is not a very straightforward measure to calculate. I definitely want to find a better way to calculate this going forward.
- avgRaise
Once I made it to the breakdown by city agency, I took most of the measures listed above and calculated their median. I chose the median because I figured like many organizations, the city probably pays those at the top very well and those at the bottom...less well. The mean is very sensitive to these outliers and the median is not. I'd love to do a modal breakdown in a future analysis. I'd probably create pay ranges of some kind so we can see what percentage of employees across the city fall into different ranges.
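That pay-range breakdown would probably lean on pd.cut. A rough sketch, with bin edges that are arbitrary placeholders:

# rough sketch of the future pay-range breakdown; bin edges are placeholders
salary_ranges = pd.cut(
    employee_salaries_clean["annualSalary"],
    bins=[0, 30000, 50000, 70000, 90000, float("inf")],
    labels=["under $30k", "$30k-50k", "$50k-70k", "$70k-90k", "$90k+"]
)
salary_ranges.value_counts(normalize=True).sort_index()  # share of records per range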
# count number of records per city agency
agency_record_count = employee_salaries_clean.groupby(["cleanAgencyName"], as_index=False).agg(
salaryRecords=pd.NamedAgg(column="cleanAgencyName", aggfunc="count")
)
agencies_with_500_records = agency_record_count[agency_record_count["salaryRecords"] >= 500]["cleanAgencyName"].values.tolist()
# calculate median salary, max salary, min salary, growth, salary records, tenure, and average raise by employee and agency
employee_salary_quality = employee_salaries_clean.groupby(["employeeSlug", "cleanAgencyName"], as_index=False).agg(
medSalary=pd.NamedAgg(column="annualSalary", aggfunc="median"),
highestSalary=pd.NamedAgg(column="annualSalary", aggfunc="max"),
lowestSalary=pd.NamedAgg(column="annualSalary", aggfunc="min"),
growth=pd.NamedAgg(column="annualSalary", aggfunc=lambda salary: max(salary) - min(salary)),
salaryRecords=pd.NamedAgg(column="annualSalary", aggfunc="nunique"),
tenure=pd.NamedAgg(column="tenure", aggfunc="first"),
avgRaise=pd.NamedAgg(column="annualSalary", aggfunc=lambda salary: (max(salary) - min(salary)) / len(salary))
)
agency_salary_quality = employee_salary_quality.groupby(["cleanAgencyName"], as_index=False).agg(
medSalary=pd.NamedAgg(column="medSalary", aggfunc="median"),
medHighestSalary=pd.NamedAgg(column="highestSalary", aggfunc="median"),
medLowestSalary=pd.NamedAgg(column="lowestSalary", aggfunc="median"),
medGrowth=pd.NamedAgg(column="growth", aggfunc="median"),
medTenure=pd.NamedAgg(column="tenure", aggfunc="median"),
medRaise=pd.NamedAgg(column="avgRaise", aggfunc="median")
).query("cleanAgencyName in @agencies_with_500_records")
agency_salary_quality.style.hide(axis="index")
cleanAgencyName | medSalary | medHighestSalary | medLowestSalary | medGrowth | medTenure | medRaise
---|---|---|---|---|---|---
BCIT | 46319 | 46747 | 45963 | 932 | 17 | 466.00
Circuit Court | 56363 | 60411 | 54049 | 0 | 14 | 0.00
Convention Center | 37752 | 39271 | 36167 | 600 | 14 | 228.00
DPW - Admin | 55511 | 57653 | 53191 | 0 | 10 | 0.00
DPW - Solid Waste | 35615 | 37613 | 34411 | 0 | 16 | 0.00
DPW - Waste & Wastewater | 40646 | 44151 | 38411 | 1636 | 17 | 552.67
Enoch Pratt Free Library | 40570 | 42959 | 37677 | 0 | 13 | 0.00
Finance | 43155 | 45589 | 41515 | 0 | 14 | 0.00
Fire Department | 70393 | 73583 | 62175 | 6194 | 18 | 1446.40
General Services | 43784 | 48637 | 41771 | 1159 | 15.5 | 487.25
Health Department | 42907 | 44153 | 40823 | 0 | 12 | 0.00
Housing & Community Dev | 49871 | 52639 | 46583 | 0 | 14 | 0.00
Police Department | 68165 | 75183 | 61183 | 1470 | 16 | 685.00
Rec & Parks | 35615 | 37571 | 33849 | 0 | 9 | 0.00
Sheriffs Office | 50696 | 54059 | 46495 | 2870 | 18 | 1027.00
States Attorneys Office | 62481 | 64957 | 57699 | 0 | 10 | 0.00
Transportation | 39745 | 42455 | 37553 | 998 | 17 | 392.00
Transportation - Crossing Guards | 10822 | 11875 | 9891 | 0 | 16 | 0.00
I don't use a lot of box plots, but here one works well. The boxes show you the maximum, minimum, and median salary for each agency. You can also see salaries at the 25th (q1) and 75th (q3) percentiles. The numbers here are not grouped by individual employee, so they may look a little different than the table above.
# add salary box plot
ordered_salary_data = employee_salaries_clean.query("cleanAgencyName in @agencies_with_500_records").loc[:, ["cleanAgencyName", "annualSalary"]] \
.groupby(["cleanAgencyName"]) \
.median() \
.sort_values(by="annualSalary", ascending=False)
fig = px.box(employee_salaries_clean.query("cleanAgencyName in @agencies_with_500_records"),
x="cleanAgencyName",
y="annualSalary",
points=False,
category_orders={"cleanAgencyName": ordered_salary_data.index.to_list()},
labels=dict(cleanAgencyName="City Agency", annualSalary="Annual Salary ($)")
)
fig.show()