In [90]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
from plotly.graph_objs import graph_objs as go
init_notebook_mode(connected=True)

import numpy as np

from collections import OrderedDict
import pandas as pd
import re

import ipywidgets as ipyw
from ipywidgets import interact, interactive, fixed, interact_manual, widgets

Physicians' MSP billing in British Columbia for 2016

Data obtained on 2017-03-22 from:

In [4]:
# Input CSV and output path for this analysis (local Windows drive).
inDataPath = r'P:\digitalLife\Projects\MD_Pay\updatedBlueBook.csv'
outDataPath = r'P:\digitalLife\Projects\MD_Pay\incomeList.csv'

# The file is semicolon-delimited, ISO-8859-1 encoded, and uses "," as the
# thousands separator.
mdDF = pd.read_csv(inDataPath, sep=';', encoding="ISO-8859-1", thousands=',')

# Drop rows whose fullName is the literal "unknown".
mdDF = mdDF.loc[mdDF['fullName'] != "unknown"]

# Force income to a numeric dtype; values that cannot be parsed become NaN.
mdDF['income'] = pd.to_numeric(mdDF['income'], downcast='float', errors='coerce')

# Split into specialists and general practitioners, each ordered from the
# highest income to the lowest.
incomeListSpecialty = (
    mdDF
    .loc[mdDF['specialty'] != 'General Family Practice']
    .sort_values(by='income', ascending=False)
)
incomeListGP = (
    mdDF
    .loc[mdDF['specialty'] == 'General Family Practice']
    .sort_values(by='income', ascending=False)
)

Incomes for GPs

In [5]:
# Violin plot of GP incomes, one violin per gender.
dfGP = incomeListGP.loc[:, ['gender', 'income']]
figGP = ff.create_violin(
    dfGP,
    data_header='income',
    group_header='gender',
    height=500,
    width=800,
)

# Title the figure and let both x-axes pick their own range.
layout = figGP.layout
layout['title'] = 'Incomes for GPs'
for axisName in ('xaxis1', 'xaxis2'):
    layout[axisName]['autorange'] = True
figGP.layout.update(layout)
iplot(figGP, filename='GP incomes in BC')

# Row count per gender, shown as the cell's output.
dfGP.groupby('gender').count()
Out[5]:
income
gender
Female 2094
Male 3019

Incomes for Specialties

All Specialties

In [7]:
# Violin plot of specialist incomes, one violin per gender.
# Only the first 4457 rows (highest incomes first) are used: the original
# author noted that the last value in the dataframe is malformed.
# NOTE(review): 4457 is a hard-coded row count tied to this data file —
# confirm it still matches if the CSV changes.
dfSpecialty = incomeListSpecialty[['gender', 'income']].iloc[:4457]
figSpecialty = ff.create_violin(
    dfSpecialty,
    data_header='income',
    group_header='gender',
    height=500,
    width=800,
)

# Title the figure and let both x-axes pick their own range.
layout = figSpecialty.layout
layout['title'] = 'Incomes for All Specialties'
for axisName in ('xaxis1', 'xaxis2'):
    layout[axisName]['autorange'] = True
figSpecialty.layout.update(layout)
iplot(figSpecialty, filename='GP incomes specialty')

# Row count per gender, shown as the cell's output.
dfSpecialty.groupby('gender').count()
Out[7]:
income
gender
Female 1402
Male 3055
In [8]:
# Work on a copy so the unfiltered specialist table stays untouched.
incomeSpecialtyListFiltered = incomeListSpecialty.copy()

# Clean-up rules for the subspecialty label, applied in this order. Each
# pattern is deleted from the label (replaced with the empty string).
# The first rule used to be r"^CCFP|"; the stray trailing "|" added an
# alternative that matched the empty string everywhere, so the rule only ever
# stripped a leading "CCFP". It is now written as what it actually does.
subspecialtyCleanupPatterns = [
    r"^CCFP",        # leading "CCFP"
    r"RCPSC\s-\s",   # "RCPSC - " wherever it occurs
    r"\+.*$",        # everything from the first "+" onwards
    r"\|.*$",        # everything from the first "|" onwards
    r"^\s-\s",       # leading " - " (e.g. left behind once "CCFP" is stripped)
]


def _cleanSubspecialty(label):
    """Return `label` with every clean-up pattern removed, in order."""
    for pattern in subspecialtyCleanupPatterns:
        label = re.sub(pattern, "", label)
    return label


incomeSpecialtyListFiltered['subspecialty'] = (
    incomeSpecialtyListFiltered['subspecialty'].map(_cleanSubspecialty)
)

# Rows per cleaned subspecialty, counted as non-null entries of the 'index'
# column (presumably a row-id column from the CSV — TODO confirm).
membersCount = incomeSpecialtyListFiltered.groupby('subspecialty').count()['index']

# Alphabetical list of every cleaned subspecialty, then the same list split
# into those with fewer than 3 members and those with 3 or more.
specialtiesOfAll = sorted(incomeSpecialtyListFiltered.subspecialty.unique())
specialtiesOfFew = [name for name in specialtiesOfAll if membersCount[name] < 3]
specialtiesOfMany = [name for name in specialtiesOfAll if not membersCount[name] < 3]

Income by Specialty

In [10]:
def f(x):
    """Show the income distribution by gender for a single specialty.

    Parameters
    ----------
    x : str
        Subspecialty label to filter on. A falsy value only prints a prompt
        asking the user to pick a specialty.

    Returns
    -------
    None
        Everything is printed or plotted. The per-gender row counts are
        always printed. A violin plot is drawn when both 'Female' and 'Male'
        have more than two rows; otherwise the matching rows are printed.
    """
    if not x:
        print("Select a specialty from the drop down menu.")
        return

    # Rows of the chosen specialty, highest income first.
    isSelected = incomeSpecialtyListFiltered.subspecialty == x
    rankedRows = incomeSpecialtyListFiltered[isSelected].sort_values(by=['income'], ascending=[False])
    dfSpecialty = rankedRows[['gender', 'income']]

    specialtyCounts = dfSpecialty.groupby('gender').count()
    print(specialtyCounts)

    # A violin needs a few points per group: require more than one gender
    # group and more than two rows for each of 'Female' and 'Male'.
    enoughOfEach = (
        specialtyCounts.shape[0] > 1
        and specialtyCounts['income']['Female'] > 2
        and specialtyCounts['income']['Male'] > 2
    )
    if not enoughOfEach:
        print("Too few doctors in the data set filtered by: " + x)
        print(dfSpecialty.to_string(index=False))
        return

    figSpecialty = ff.create_violin(dfSpecialty, data_header='income', group_header='gender', height=500, width=800)
    layout = figSpecialty.layout
    layout['title'] = 'Incomes for ' + x + ' specialty'
    for axisName in ('xaxis1', 'xaxis2'):
        layout[axisName]['autorange'] = True
    figSpecialty.layout.update(layout)
    iplot(figSpecialty, filename='GP incomes specialty')

# Hook f up to an interactive control whose choices are the specialties that
# have at least 3 members (specialtiesOfMany); f is called with the chosen value.
interact(f, x=specialtiesOfMany)
In [110]:
tempDF = incomeSpecialtyListFiltered.copy()
# Exclude rows whose subspecialty is the literal 'Male'
# (presumably a malformed row with shifted columns — TODO confirm).
tempDF = tempDF[tempDF.subspecialty != 'Male']

# Median income per specialty, ordered from the highest median to the lowest.
specialtyMedianIncome = tempDF.groupby(['subspecialty'])['income'].median()
specialtyMedianIncome = specialtyMedianIncome.sort_values(ascending=False)

specialtyList = []
incomeMale = []
incomeFemale = []
# Walk the specialties in that order and collect the median income of each
# gender. Iterating the index avoids Series.iteritems(), which newer pandas
# releases no longer provide.
for specialty in specialtyMedianIncome.index:
    specialtyDF = tempDF[tempDF['subspecialty'] == specialty]
    # Take the median of the income column only, so non-numeric columns in
    # the frame are never aggregated.
    medianByGender = specialtyDF.groupby('gender')['income'].median()
    # A gender with no rows in this specialty is reported as 0. Looking each
    # label up independently means a missing or unexpected gender label can
    # neither raise nor silently reuse the previous specialty's value.
    femaleMedian = medianByGender.get('Female', 0)
    maleMedian = medianByGender.get('Male', 0)
    incomeFemale.append(femaleMedian)
    incomeMale.append(maleMedian)
    specialtyList.append(specialty)
In [109]:
# Line chart of the median income per specialty, one trace per gender.
trace0 = go.Scatter(
    x=specialtyList,
    y=np.array(incomeMale),
    mode='lines+markers',
    name='Male',
)
trace1 = go.Scatter(
    x=specialtyList,
    y=np.array(incomeFemale),
    mode='lines+markers',
    name='Female',
)
data = [trace0, trace1]

# Axis styling: light-grey axis titles, small black serif tick labels, and
# x tick labels rotated 45 degrees.
layout = go.Layout(
    xaxis={
        'title': 'Specialty',
        'titlefont': {
            'family': 'Arial, sans-serif',
            'size': 12,
            'color': 'lightgrey',
        },
        'showticklabels': True,
        'tickangle': 45,
        'tickfont': {
            'family': 'Old Standard TT, serif',
            'size': 10,
            'color': 'black',
        },
    },
    yaxis={
        'title': 'Income ($)',
        'titlefont': {
            'family': 'Arial, sans-serif',
            'size': 10,
            'color': 'lightgrey',
        },
        'showticklabels': True,
        'tickfont': {
            'family': 'Old Standard TT, serif',
            'size': 10,
            'color': 'black',
        },
    },
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='scatter-mode')

Made using Jupyter Notebook.